From 1d2e429800bc440306fb3dc1d518fcdb6683144c Mon Sep 17 00:00:00 2001 From: "Vanathi.Selvaraju" Date: Wed, 27 Mar 2024 15:53:55 -0400 Subject: [PATCH] sw-manager patch-strategy failed to install due to timeout As part of this fix, the alarm clear wait step checks for the stale alarm 750.006 for 30 minutes. If the alarm is still not cleared, the patch-strategy ignores the alarm. In the current case alarm 750.006 is not getting cleared so the patch-strategy times out. Test Plan: PASSED: Applying a patch - On DX system, Create and apply patch strategy, fm alarm-list to have an uncleared alarm (for test purposes, 100.103 - Memory threshold alarm was used). After 30 minutes the alarm was ignored and the patch strategy was successfully applied. PASSED: Removing a patch - On DX system, Create and apply patch strategy, fm alarm-list to have an uncleared alarm (for test purposes, 100.103 - Memory threshold alarm was used). After 30 minutes the alarm was ignored and the patch strategy was successfully applied. TODO: test with the actual alarm 750.006 in the lab setup. Closes-Bug: 2059305 Change-Id: I7ebaf5a24fa45a7e45f3af7e5ca588ce3ee06156 Signed-off-by: Vanathi.Selvaraju --- nfv/nfv-vim/nfv_vim/strategy/_strategy.py | 25 +++++++---- .../nfv_vim/strategy/_strategy_steps.py | 41 +++++++++++++++++-- 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/nfv/nfv-vim/nfv_vim/strategy/_strategy.py b/nfv/nfv-vim/nfv_vim/strategy/_strategy.py index 2517e385..7e8c8df1 100755 --- a/nfv/nfv-vim/nfv_vim/strategy/_strategy.py +++ b/nfv/nfv-vim/nfv_vim/strategy/_strategy.py @@ -992,8 +992,9 @@ class UpdateControllerHostsMixin(object): # OSDs configured, but the alarms should clear quickly in # that case so this will not delay the update strategy. 
stage.add_step(strategy.WaitAlarmsClearStep( - timeout_in_secs=30 * 60, - ignore_alarms=self._ignore_alarms)) + timeout_in_secs=40 * 60, + ignore_alarms=self._ignore_alarms, + ignore_alarms_conditional=self._ignore_alarms_conditional)) else: # Less time required if host is not rebooting stage.add_step(strategy.SystemStabilizeStep( @@ -1004,7 +1005,8 @@ class UpdateControllerHostsMixin(object): host_list = [local_host] stage = strategy.StrategyStage(strategy_stage_name) stage.add_step(strategy.QueryAlarmsStep( - True, ignore_alarms=self._ignore_alarms)) + True, ignore_alarms=self._ignore_alarms, + ignore_alarms_conditional=self._ignore_alarms_conditional)) if reboot: stage.add_step(strategy.SwactHostsStep(host_list)) stage.add_step(strategy.LockHostsStep(host_list)) @@ -1025,8 +1027,9 @@ class UpdateControllerHostsMixin(object): # OSDs configured, but the alarms should clear quickly in # that case so this will not delay the update strategy. stage.add_step(strategy.WaitAlarmsClearStep( - timeout_in_secs=30 * 60, - ignore_alarms=self._ignore_alarms)) + timeout_in_secs=40 * 60, + ignore_alarms=self._ignore_alarms, + ignore_alarms_conditional=self._ignore_alarms_conditional)) else: # Less time required if host is not rebooting stage.add_step(strategy.SystemStabilizeStep( @@ -1297,8 +1300,9 @@ class UpdateWorkerHostsMixin(object): for host in hosts_to_lock + hosts_to_reboot]): # Multiple personality nodes that need to wait for OSDs to sync: stage.add_step(strategy.WaitAlarmsClearStep( - timeout_in_secs=30 * 60, - ignore_alarms=self._ignore_alarms)) + timeout_in_secs=40 * 60, + ignore_alarms=self._ignore_alarms, + ignore_alarms_conditional=self._ignore_alarms_conditional)) else: if any([host.openstack_control or host.openstack_compute for host in hosts_to_lock + hosts_to_reboot]): @@ -1393,9 +1397,13 @@ class SwPatchStrategy(SwUpdateStrategy, '100.119', # PTP alarm for SyncE '900.701', # Node tainted ] + IGNORE_ALARMS_CONDITIONAL = {'100.103': 1800} self._ignore_alarms += 
IGNORE_ALARMS self._single_controller = single_controller + # This is only for patch strategy to ignore 750.006 alarm when it becomes stale + self._ignore_alarms_conditional = IGNORE_ALARMS_CONDITIONAL + # initialize the variables required by the mixins # ie: self._nfvi_sw_patches, self._nfvi_sw_patch_hosts self.initialize_mixin() @@ -1409,7 +1417,8 @@ class SwPatchStrategy(SwUpdateStrategy, stage = strategy.StrategyStage( strategy.STRATEGY_STAGE_NAME.SW_PATCH_QUERY) stage.add_step( - strategy.QueryAlarmsStep(ignore_alarms=self._ignore_alarms)) + strategy.QueryAlarmsStep(ignore_alarms=self._ignore_alarms, + ignore_alarms_conditional=self._ignore_alarms_conditional)) stage.add_step(strategy.QuerySwPatchesStep()) stage.add_step(strategy.QuerySwPatchHostsStep()) self.build_phase.add_stage(stage) diff --git a/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py b/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py index a55df5ac..f4cdb0bd 100755 --- a/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py +++ b/nfv/nfv-vim/nfv_vim/strategy/_strategy_steps.py @@ -1912,13 +1912,17 @@ class QueryAlarmsStep(strategy.StrategyStep): """ Query Alarms - Strategy Step """ - def __init__(self, fail_on_alarms=False, ignore_alarms=None): + def __init__(self, fail_on_alarms=False, ignore_alarms=None, ignore_alarms_conditional=None): super(QueryAlarmsStep, self).__init__( STRATEGY_STEP_NAME.QUERY_ALARMS, timeout_in_secs=60) if ignore_alarms is None: ignore_alarms = [] self._fail_on_alarms = fail_on_alarms self._ignore_alarms = ignore_alarms + # For ignoring 750.006 alarm for patch strategy + if ignore_alarms_conditional is None: + ignore_alarms_conditional = [] + self._ignore_alarms_conditional = ignore_alarms_conditional @coroutine def _query_alarms_callback(self, fm_service): @@ -1940,7 +1944,8 @@ class QueryAlarmsStep(strategy.StrategyStep): "%s - uuid %s due to relaxed alarm " "strictness" % (nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid)) - elif nfvi_alarm.alarm_id not in self._ignore_alarms: + 
elif (nfvi_alarm.alarm_id not in self._ignore_alarms and + nfvi_alarm.alarm_id not in self._ignore_alarms_conditional): DLOG.warn("Alarm: %s" % nfvi_alarm.alarm_id) nfvi_alarms.append(nfvi_alarm) else: @@ -2106,7 +2111,8 @@ class WaitAlarmsClearStep(strategy.StrategyStep): """ Alarm Wait - Strategy Step """ - def __init__(self, timeout_in_secs=300, first_query_delay_in_secs=60, ignore_alarms=None): + def __init__(self, timeout_in_secs=300, first_query_delay_in_secs=60, ignore_alarms=None, + ignore_alarms_conditional=None): super(WaitAlarmsClearStep, self).__init__( STRATEGY_STEP_NAME.WAIT_ALARMS_CLEAR, timeout_in_secs=timeout_in_secs) self._first_query_delay_in_secs = first_query_delay_in_secs @@ -2115,12 +2121,17 @@ class WaitAlarmsClearStep(strategy.StrategyStep): self._ignore_alarms = ignore_alarms self._wait_time = 0 self._query_inprogress = False + if ignore_alarms_conditional is None: + ignore_alarms_conditional = {} + self._ignore_alarms_conditional = ignore_alarms_conditional @coroutine def _query_alarms_callback(self): """ Query Alarms Callback """ + from datetime import datetime + response = (yield) DLOG.debug("Query-Alarms callback response=%s." 
% response) @@ -2138,6 +2149,25 @@ class WaitAlarmsClearStep(strategy.StrategyStep): "strictness" % (nfvi_alarm.alarm_id, nfvi_alarm.alarm_uuid)) elif nfvi_alarm.alarm_id not in self._ignore_alarms: + # For ignoring 750.006 alarm for patch strategy + if nfvi_alarm.alarm_id in self._ignore_alarms_conditional: + format_string = "%Y-%m-%dT%H:%M:%S.%f" + alarm_timestamp = nfvi_alarm.timestamp + alarm_timestamp_obj = datetime.strptime( + alarm_timestamp, format_string) + current_time = datetime.now() + time_in_sec = ( + current_time - alarm_timestamp_obj).total_seconds() + + # Ignore 750.006 alarm if present for more than 30 mins(1800s) + if self._ignore_alarms_conditional[nfvi_alarm.alarm_id] < int(time_in_sec): + self._ignore_alarms.append( + list(self._ignore_alarms_conditional.keys())[0]) + else: + nfvi_alarms.append(nfvi_alarm) + else: + nfvi_alarms.append(nfvi_alarm) + nfvi_alarms.append(nfvi_alarm) else: DLOG.debug("Ignoring alarm %s - uuid %s" % @@ -2145,6 +2175,9 @@ class WaitAlarmsClearStep(strategy.StrategyStep): self.strategy.nfvi_alarms = nfvi_alarms if self.strategy.nfvi_alarms: + for alarm in self.strategy.nfvi_alarms: + if alarm['alarm_id'] == list(self._ignore_alarms_conditional.keys())[0]: + self.strategy.nfvi_alarms.remove(alarm) # Keep waiting for alarms to clear pass else: @@ -2193,6 +2226,7 @@ class WaitAlarmsClearStep(strategy.StrategyStep): super(WaitAlarmsClearStep, self).from_dict(data) self._first_query_delay_in_secs = data['first_query_delay_in_secs'] self._ignore_alarms = data['ignore_alarms'] + self._ignore_alarms_conditional = data['ignore_alarms_conditional'] self._wait_time = 0 self._query_inprogress = False return self @@ -2207,6 +2241,7 @@ class WaitAlarmsClearStep(strategy.StrategyStep): data['entity_uuids'] = list() data['first_query_delay_in_secs'] = self._first_query_delay_in_secs data['ignore_alarms'] = self._ignore_alarms + data['ignore_alarms_conditional'] = self._ignore_alarms_conditional return data