Enhance handling of on-going patch strategy

This commit includes:
1. Rebuild the region one patch cache during the service reload.
2. Ignore the patch in progress alarm as this could be a patch
orchestration retry.
3. Re-create workers if they are cleared after a service restart.
4. Properly handle reboot-required patching in case both system
controller and subcloud patch orchestration are done in a single
strategy.
With these improvements, the patch strategy can continue after a
service reload.

Test plan(passed):
1. Verify successful patch orchestration of an RR patch when both
system controller and subclouds are patched in the same strategy.
2. Induce a 300.005 alarm (mgmt-affecting) in a subcloud, verify
that orchestrated patching fails for that subcloud.
3. Induce a 900.001 alarm by partially applying a patch in a subcloud
beforehand, verify that orchestrated patching completes for that
subcloud.
4. Induce process restart in the middle of a subcloud patch
orchestration, verify that transitional strategy steps are set to
failed and the subclouds still in "initial" state can continue.
5. Induce process restart in the middle of a system controller patch
orchestration, verify that system controller patching can resume and
complete.

Closes-Bug: 1979097
Signed-off-by: Yuxing Jiang<Yuxing.Jiang@windriver.com>
Change-Id: I1b70d14b77c3e1be6301f011baff297502b9108b
This commit is contained in:
Yuxing Jiang 2022-06-21 11:57:02 -04:00
parent 1242c26b9e
commit 29fe24acb3
2 changed files with 128 additions and 13 deletions

View File

@ -24,6 +24,7 @@ from keystoneauth1 import exceptions as keystone_exceptions
from oslo_log import log as logging
from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack.fm import FmClient
from dccommon.drivers.openstack import patching_v1
from dccommon.drivers.openstack.patching_v1 import PatchingClient
from dccommon.drivers.openstack.sdk_platform import OpenStackDriver
@ -39,6 +40,8 @@ from dcmanager.db import api as db_api
LOG = logging.getLogger(__name__)
# Alarm IDs that must not block subcloud patching even when they are
# reported as management affecting (checked in
# pre_check_management_affected_alarm).
IGNORE_ALARMS = ['900.001', ] # Patch in progress
class PatchOrchThread(threading.Thread):
"""Patch Orchestration Thread
@ -87,6 +90,8 @@ class PatchOrchThread(threading.Thread):
def run(self):
LOG.info("PatchOrchThread Starting")
# Build region one patches cache whenever the service is reloaded
self.get_region_one_patches()
self.patch_orch()
# Stop any greenthreads that are still running
self.thread_group_manager.stop()
@ -119,6 +124,12 @@ class PatchOrchThread(threading.Thread):
return vim.VimClient(region_name, ks_client.session,
endpoint=ks_client.endpoint_cache.get_endpoint('vim'))
# TODO(yuxing) Remove this function once the cgts client accepts
# alarm_ignore_list.
def get_fm_client(self, region_name):
    """Return an FmClient for the given region.

    The client is built on the session of the keystone client obtained
    for ``region_name`` via ``self.get_ks_client``.
    """
    session = self.get_ks_client(region_name).session
    return FmClient(region_name, session)
@staticmethod
def get_region_name(strategy_step):
"""Get the region name for a strategy step"""
@ -190,8 +201,8 @@ class PatchOrchThread(threading.Thread):
if sw_update_strategy.type == consts.SW_UPDATE_TYPE_PATCH:
if sw_update_strategy.state in [
consts.SW_UPDATE_STATE_APPLYING,
consts.SW_UPDATE_STATE_ABORTING]:
consts.SW_UPDATE_STATE_APPLYING,
consts.SW_UPDATE_STATE_ABORTING]:
self.apply(sw_update_strategy)
elif sw_update_strategy.state == \
consts.SW_UPDATE_STATE_ABORT_REQUESTED:
@ -213,14 +224,17 @@ class PatchOrchThread(threading.Thread):
LOG.info("PatchOrchThread ended main loop")
def pre_check_management_affected_alarm(self, strategy_step):
def pre_check_management_affected_alarm(self, subcloud_name):
# The health conditions acceptable for subcloud patching are:
# a) subcloud is completely healthy (i.e. no failed checks)
# b) there is alarm but no management affected alarm
# c) subcloud fails alarm check and it only has non-management
# affecting alarm(s)
system_health = self.get_sysinv_client(
strategy_step.subcloud.name).get_system_health()
# d) subcloud fails alarm check but the alarms are in the
# IGNORE_ALARMS list
# TODO(yuxing) Update the cgtsclient and the sysinv client driver to
# accept alarm_ignore_list to avoid retrieving alarms from FM client.
system_health = self.get_sysinv_client(subcloud_name).get_system_health()
failed_alarm_check = re.findall("No alarms: \[Fail\]", system_health)
no_mgmt_alarms = re.findall("\[0\] of which are management affecting",
@ -228,7 +242,16 @@ class PatchOrchThread(threading.Thread):
if not failed_alarm_check or no_mgmt_alarms:
return True
else:
return False
alarms = self.get_fm_client(subcloud_name).get_alarms()
for alarm in alarms:
# This alarm cannot be ignored
if (alarm.mgmt_affecting == "True") and (
alarm.alarm_id not in IGNORE_ALARMS):
return False
# Either the non-management affecting alarms or the skippable alarm
# can be ignored, return true
return True
def apply(self, sw_update_strategy):
"""Apply a patch strategy"""
@ -362,14 +385,26 @@ class PatchOrchThread(threading.Thread):
elif strategy_step.state == \
consts.STRATEGY_STATE_UPDATING_PATCHES:
if region in self.subcloud_workers:
if region not in self.subcloud_workers:
# The worker is missed, caused by host swact or service
# reload.
self._create_worker_thread(
region, strategy_step.state, strategy_step,
self.update_subcloud_patches)
else:
# The update is in progress
LOG.debug("Update patches is in progress for %s."
% region)
elif strategy_step.state == \
consts.STRATEGY_STATE_CREATING_STRATEGY:
if self.subcloud_workers[region][0] != \
if region not in self.subcloud_workers:
# The worker is missed, caused by host swact or service
# reload.
self._create_worker_thread(
region, consts.STRATEGY_STATE_CREATING_STRATEGY,
strategy_step, self.create_subcloud_strategy)
elif self.subcloud_workers[region][0] != \
consts.STRATEGY_STATE_CREATING_STRATEGY:
self._create_worker_thread(
region, consts.STRATEGY_STATE_CREATING_STRATEGY,
@ -379,7 +414,13 @@ class PatchOrchThread(threading.Thread):
% region)
elif strategy_step.state == \
consts.STRATEGY_STATE_APPLYING_STRATEGY:
if self.subcloud_workers[region][0] != \
if region not in self.subcloud_workers:
# The worker is missed, caused by host swact or service
# reload.
self._create_worker_thread(
region, consts.STRATEGY_STATE_APPLYING_STRATEGY,
strategy_step, self.apply_subcloud_strategy)
elif self.subcloud_workers[region][0] != \
consts.STRATEGY_STATE_APPLYING_STRATEGY:
self._create_worker_thread(
region, consts.STRATEGY_STATE_APPLYING_STRATEGY,
@ -389,7 +430,13 @@ class PatchOrchThread(threading.Thread):
% region)
elif strategy_step.state == \
consts.STRATEGY_STATE_FINISHING:
if self.subcloud_workers[region][0] != \
if region not in self.subcloud_workers:
# The worker is missed, caused by host swact or service
# reload.
self._create_worker_thread(
region, consts.STRATEGY_STATE_FINISHING,
strategy_step, self.finish)
elif self.subcloud_workers[region][0] != \
consts.STRATEGY_STATE_FINISHING:
self._create_worker_thread(
region, consts.STRATEGY_STATE_FINISHING,
@ -430,7 +477,8 @@ class PatchOrchThread(threading.Thread):
error_msg = None
try:
# If management affected alarm check failed
if not self.pre_check_management_affected_alarm(strategy_step):
if not self.pre_check_management_affected_alarm(
strategy_step.subcloud.name):
error_msg = ("Subcloud %s contains one or more management "
"affecting alarm(s). It will not be patched. "
"Please resolve the alarm condition(s) and try again."
@ -1252,6 +1300,7 @@ class PatchOrchThread(threading.Thread):
def _create_worker_thread(self, region, state, strategy_step, state_op):
if region in self.subcloud_workers:
# Worker is not in the right state, delete it.
del self.subcloud_workers[region]
self.subcloud_workers[region] = \

View File

@ -32,6 +32,7 @@ from dcmanager.orchestrator import sw_update_manager
from dcmanager.tests import base
from dcmanager.tests.unit.common import fake_strategy
from dcmanager.tests.unit.common import fake_subcloud
from dcmanager.tests.unit.orchestrator.states.fakes import FakeAlarm
from dcmanager.tests import utils
@ -503,6 +504,26 @@ class FakeSysinvClientMgmtAffectAlarm(object):
return self.health_report
class FakeFMClientIgnoredAlarm(object):
    """Fake FM client that reports a single '900.001' alarm.

    NOTE(review): FakeAlarm's arguments are presumably
    (alarm_id, mgmt_affecting) -- confirm against the fakes module.
    """

    def __init__(self, region, session):
        self.region, self.session = region, session
        ignored_alarm = FakeAlarm('900.001', 'True')
        self.alarm_list = [ignored_alarm]

    def get_alarms(self):
        """Return the canned alarm list."""
        return self.alarm_list
class FakeFMClientAlarm(object):
    """Fake FM client that reports alarms '100.001' and '100.002'.

    NOTE(review): FakeAlarm's arguments are presumably
    (alarm_id, mgmt_affecting) -- confirm against the fakes module.
    """

    def __init__(self, region, session):
        self.region, self.session = region, session
        self.alarm_list = [
            FakeAlarm(alarm_id, 'True') for alarm_id in ('100.001', '100.002')
        ]

    def get_alarms(self):
        """Return the canned alarm list."""
        return self.alarm_list
class Controller(object):
def __init__(self, hostname):
self.hostname = hostname
@ -1973,13 +1994,14 @@ class TestSwUpdateManager(base.DCManagerTestCase):
self.assertEqual(updated_strategy_steps[0]['state'],
consts.STRATEGY_STATE_CREATING_STRATEGY)
@mock.patch.object(patch_orch_thread, 'FmClient')
@mock.patch.object(patch_orch_thread, 'SysinvClient')
@mock.patch.object(os_path, 'isfile')
@mock.patch.object(patch_orch_thread, 'PatchingClient')
@mock.patch.object(threading, 'Thread')
def test_update_subcloud_patches_management_affected_alarm(
self, mock_threading,
mock_patching_client, mock_os_path_isfile, mock_sysinv_client):
self, mock_threading, mock_patching_client, mock_os_path_isfile,
mock_sysinv_client, mock_fm_client):
subcloud_id = fake_subcloud.create_fake_subcloud(self.ctx).id
subcloud = db_api.subcloud_update(
@ -1996,6 +2018,7 @@ class TestSwUpdateManager(base.DCManagerTestCase):
mock_os_path_isfile.return_value = True
mock_patching_client.side_effect = FakePatchingClientAvailable
mock_sysinv_client.side_effect = FakeSysinvClientMgmtAffectAlarm
mock_fm_client.return_value = FakeFMClientAlarm('fake_region', 'fake_session')
FakePatchingClientAvailable.apply = mock.Mock()
@ -2014,6 +2037,49 @@ class TestSwUpdateManager(base.DCManagerTestCase):
self.assertEqual(updated_strategy_steps[0]['state'],
consts.STRATEGY_STATE_FAILED)
# Verify that update_subcloud_patches moves the strategy step on to
# "creating strategy" when the FM client only reports the 900.001 alarm.
# Decorators are applied bottom-up, so the mock arguments are received in
# the reverse of the order listed here (Thread first, FmClient last).
@mock.patch.object(patch_orch_thread, 'FmClient')
@mock.patch.object(patch_orch_thread, 'SysinvClient')
@mock.patch.object(os_path, 'isfile')
@mock.patch.object(patch_orch_thread, 'PatchingClient')
@mock.patch.object(threading, 'Thread')
def test_update_subcloud_patches_ignored_alarm(
self, mock_threading, mock_patching_client, mock_os_path_isfile,
mock_sysinv_client, mock_fm_client):
# Create a managed, online subcloud with a strategy step in the initial
# state.
subcloud_id = fake_subcloud.create_fake_subcloud(self.ctx).id
subcloud = db_api.subcloud_update(
self.ctx,
subcloud_id,
management_state=dccommon_consts.MANAGEMENT_MANAGED,
availability_status=dccommon_consts.AVAILABILITY_ONLINE)
fake_strategy.create_fake_strategy_step(
self.ctx,
subcloud_id=subcloud.id,
state=consts.STRATEGY_STATE_INITIAL)
strategy_step = db_api.strategy_step_get_by_name(self.ctx, subcloud.name)
# Wire up the fakes: the sysinv fake is the management-affecting-alarm
# variant, while the FM client fake reports only the 900.001 alarm.
mock_os_path_isfile.return_value = True
mock_patching_client.side_effect = FakePatchingClientAvailable
mock_sysinv_client.side_effect = FakeSysinvClientMgmtAffectAlarm
mock_fm_client.return_value = FakeFMClientIgnoredAlarm('fake_region', 'fake_session')
FakePatchingClientAvailable.apply = mock.Mock()
# Make the thread report itself as never stopped.
sw_update_manager.PatchOrchThread.stopped = lambda x: False
mock_strategy_lock = mock.Mock()
pot = sw_update_manager.PatchOrchThread(mock_strategy_lock,
self.fake_dcmanager_audit_api)
# Replace the keystone client lookup with a mock.
pot.get_ks_client = mock.Mock()
# invoke get_region_one_patches once to update required attributes
pot.get_region_one_patches()
pot.update_subcloud_patches(strategy_step)
# Verify that strategy step was updated
updated_strategy_steps = db_api.strategy_step_get_all(self.ctx)
self.assertEqual(updated_strategy_steps[0]['state'],
consts.STRATEGY_STATE_CREATING_STRATEGY)
@mock.patch.object(patch_orch_thread, 'SysinvClient')
@mock.patch.object(os_path, 'isfile')
@mock.patch.object(patch_orch_thread, 'PatchingClient')