Enhance handling of on-going patch strategy
This commit includes: 1. Rebuild the region one patch cache during the service reload. 2. Ignore the patch in progress alarm as this could be a patch orchestration retry. 3. Re-create workers if they are cleared after a service restart. 4. Properly handle reboot-required patching in case both system controller and subcloud patch orchestration are done in one single strategy. With these improvements, the patch strategy can continue after a service reload. Test plan (passed): 1. Verify successful patch orchestration of an RR (reboot-required) patch when both system controller and subclouds are patched in the same strategy. 2. Induce a 300.005 alarm (mgmt-affecting) in a subcloud, verify that orchestrated patching fails for that subcloud. 3. Induce a 900.001 alarm by partially applying a patch in a subcloud beforehand, verify that orchestrated patching completes for that subcloud. 4. Induce process restart in the middle of a subcloud patch orchestration, verify that transitional strategy steps are set to failed and the subclouds still in "initial" state can continue. 5. Induce process restart in the middle of a system controller patch orchestration, verify that system controller patching can resume and complete. Closes-Bug: 1979097 Signed-off-by: Yuxing Jiang <Yuxing.Jiang@windriver.com> Change-Id: I1b70d14b77c3e1be6301f011baff297502b9108b
This commit is contained in:
parent
1242c26b9e
commit
29fe24acb3
@ -24,6 +24,7 @@ from keystoneauth1 import exceptions as keystone_exceptions
|
||||
from oslo_log import log as logging
|
||||
|
||||
from dccommon import consts as dccommon_consts
|
||||
from dccommon.drivers.openstack.fm import FmClient
|
||||
from dccommon.drivers.openstack import patching_v1
|
||||
from dccommon.drivers.openstack.patching_v1 import PatchingClient
|
||||
from dccommon.drivers.openstack.sdk_platform import OpenStackDriver
|
||||
@ -39,6 +40,8 @@ from dcmanager.db import api as db_api
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
IGNORE_ALARMS = ['900.001', ] # Patch in progress
|
||||
|
||||
|
||||
class PatchOrchThread(threading.Thread):
|
||||
"""Patch Orchestration Thread
|
||||
@ -87,6 +90,8 @@ class PatchOrchThread(threading.Thread):
|
||||
|
||||
def run(self):
|
||||
LOG.info("PatchOrchThread Starting")
|
||||
# Build region one patches cache whenever the service is reloaded
|
||||
self.get_region_one_patches()
|
||||
self.patch_orch()
|
||||
# Stop any greenthreads that are still running
|
||||
self.thread_group_manager.stop()
|
||||
@ -119,6 +124,12 @@ class PatchOrchThread(threading.Thread):
|
||||
return vim.VimClient(region_name, ks_client.session,
|
||||
endpoint=ks_client.endpoint_cache.get_endpoint('vim'))
|
||||
|
||||
# TODO(yuxing) need to remove this function after the cgts client accepts
|
||||
# alarm_ignore_list.
|
||||
def get_fm_client(self, region_name):
    """Build an FM client for region_name using that region's keystone session."""
    session = self.get_ks_client(region_name).session
    return FmClient(region_name, session)
|
||||
|
||||
@staticmethod
|
||||
def get_region_name(strategy_step):
|
||||
"""Get the region name for a strategy step"""
|
||||
@ -190,8 +201,8 @@ class PatchOrchThread(threading.Thread):
|
||||
|
||||
if sw_update_strategy.type == consts.SW_UPDATE_TYPE_PATCH:
|
||||
if sw_update_strategy.state in [
|
||||
consts.SW_UPDATE_STATE_APPLYING,
|
||||
consts.SW_UPDATE_STATE_ABORTING]:
|
||||
consts.SW_UPDATE_STATE_APPLYING,
|
||||
consts.SW_UPDATE_STATE_ABORTING]:
|
||||
self.apply(sw_update_strategy)
|
||||
elif sw_update_strategy.state == \
|
||||
consts.SW_UPDATE_STATE_ABORT_REQUESTED:
|
||||
@ -213,14 +224,17 @@ class PatchOrchThread(threading.Thread):
|
||||
|
||||
LOG.info("PatchOrchThread ended main loop")
|
||||
|
||||
def pre_check_management_affected_alarm(self, strategy_step):
|
||||
def pre_check_management_affected_alarm(self, subcloud_name):
|
||||
# The health conditions acceptable for subcloud patching are:
|
||||
# a) subcloud is completely healthy (i.e. no failed checks)
|
||||
# b) there is alarm but no management affected alarm
|
||||
# c) subcloud fails alarm check and it only has non-management
|
||||
# affecting alarm(s)
|
||||
system_health = self.get_sysinv_client(
|
||||
strategy_step.subcloud.name).get_system_health()
|
||||
# d) subcloud fails alarm check but the alarms are in the
|
||||
# IGNORE_ALARMS list
|
||||
# TODO(yuxing) Update the cgtsclient and the sysinv client driver to
|
||||
# accept alarm_ignore_list to avoid retrieving alarms from FM client.
|
||||
system_health = self.get_sysinv_client(subcloud_name).get_system_health()
|
||||
|
||||
failed_alarm_check = re.findall("No alarms: \[Fail\]", system_health)
|
||||
no_mgmt_alarms = re.findall("\[0\] of which are management affecting",
|
||||
@ -228,7 +242,16 @@ class PatchOrchThread(threading.Thread):
|
||||
if not failed_alarm_check or no_mgmt_alarms:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
alarms = self.get_fm_client(subcloud_name).get_alarms()
|
||||
for alarm in alarms:
|
||||
# This alarm cannot be ignored
|
||||
|
||||
if (alarm.mgmt_affecting == "True") and (
|
||||
alarm.alarm_id not in IGNORE_ALARMS):
|
||||
return False
|
||||
# Either the non-management affecting alarms or the skippable alarm
|
||||
# can be ignored, return true
|
||||
return True
|
||||
|
||||
def apply(self, sw_update_strategy):
|
||||
"""Apply a patch strategy"""
|
||||
@ -362,14 +385,26 @@ class PatchOrchThread(threading.Thread):
|
||||
|
||||
elif strategy_step.state == \
|
||||
consts.STRATEGY_STATE_UPDATING_PATCHES:
|
||||
if region in self.subcloud_workers:
|
||||
if region not in self.subcloud_workers:
|
||||
# The worker is missed, caused by host swact or service
|
||||
# reload.
|
||||
self._create_worker_thread(
|
||||
region, strategy_step.state, strategy_step,
|
||||
self.update_subcloud_patches)
|
||||
else:
|
||||
# The update is in progress
|
||||
LOG.debug("Update patches is in progress for %s."
|
||||
% region)
|
||||
|
||||
elif strategy_step.state == \
|
||||
consts.STRATEGY_STATE_CREATING_STRATEGY:
|
||||
if self.subcloud_workers[region][0] != \
|
||||
if region not in self.subcloud_workers:
|
||||
# The worker is missed, caused by host swact or service
|
||||
# reload.
|
||||
self._create_worker_thread(
|
||||
region, consts.STRATEGY_STATE_CREATING_STRATEGY,
|
||||
strategy_step, self.create_subcloud_strategy)
|
||||
elif self.subcloud_workers[region][0] != \
|
||||
consts.STRATEGY_STATE_CREATING_STRATEGY:
|
||||
self._create_worker_thread(
|
||||
region, consts.STRATEGY_STATE_CREATING_STRATEGY,
|
||||
@ -379,7 +414,13 @@ class PatchOrchThread(threading.Thread):
|
||||
% region)
|
||||
elif strategy_step.state == \
|
||||
consts.STRATEGY_STATE_APPLYING_STRATEGY:
|
||||
if self.subcloud_workers[region][0] != \
|
||||
if region not in self.subcloud_workers:
|
||||
# The worker is missed, caused by host swact or service
|
||||
# reload.
|
||||
self._create_worker_thread(
|
||||
region, consts.STRATEGY_STATE_APPLYING_STRATEGY,
|
||||
strategy_step, self.apply_subcloud_strategy)
|
||||
elif self.subcloud_workers[region][0] != \
|
||||
consts.STRATEGY_STATE_APPLYING_STRATEGY:
|
||||
self._create_worker_thread(
|
||||
region, consts.STRATEGY_STATE_APPLYING_STRATEGY,
|
||||
@ -389,7 +430,13 @@ class PatchOrchThread(threading.Thread):
|
||||
% region)
|
||||
elif strategy_step.state == \
|
||||
consts.STRATEGY_STATE_FINISHING:
|
||||
if self.subcloud_workers[region][0] != \
|
||||
if region not in self.subcloud_workers:
|
||||
# The worker is missed, caused by host swact or service
|
||||
# reload.
|
||||
self._create_worker_thread(
|
||||
region, consts.STRATEGY_STATE_FINISHING,
|
||||
strategy_step, self.finish)
|
||||
elif self.subcloud_workers[region][0] != \
|
||||
consts.STRATEGY_STATE_FINISHING:
|
||||
self._create_worker_thread(
|
||||
region, consts.STRATEGY_STATE_FINISHING,
|
||||
@ -430,7 +477,8 @@ class PatchOrchThread(threading.Thread):
|
||||
error_msg = None
|
||||
try:
|
||||
# If management affected alarm check failed
|
||||
if not self.pre_check_management_affected_alarm(strategy_step):
|
||||
if not self.pre_check_management_affected_alarm(
|
||||
strategy_step.subcloud.name):
|
||||
error_msg = ("Subcloud %s contains one or more management "
|
||||
"affecting alarm(s). It will not be patched. "
|
||||
"Please resolve the alarm condition(s) and try again."
|
||||
@ -1252,6 +1300,7 @@ class PatchOrchThread(threading.Thread):
|
||||
|
||||
def _create_worker_thread(self, region, state, strategy_step, state_op):
|
||||
if region in self.subcloud_workers:
|
||||
# Worker is not in the right state, delete it.
|
||||
del self.subcloud_workers[region]
|
||||
|
||||
self.subcloud_workers[region] = \
|
||||
|
@ -32,6 +32,7 @@ from dcmanager.orchestrator import sw_update_manager
|
||||
from dcmanager.tests import base
|
||||
from dcmanager.tests.unit.common import fake_strategy
|
||||
from dcmanager.tests.unit.common import fake_subcloud
|
||||
from dcmanager.tests.unit.orchestrator.states.fakes import FakeAlarm
|
||||
from dcmanager.tests import utils
|
||||
|
||||
|
||||
@ -503,6 +504,26 @@ class FakeSysinvClientMgmtAffectAlarm(object):
|
||||
return self.health_report
|
||||
|
||||
|
||||
class FakeFMClientIgnoredAlarm(object):
    """Fake FM client whose only alarm is 900.001 (patch in progress)."""

    def __init__(self, region, session):
        self.region, self.session = region, session
        # Second FakeAlarm argument is presumably the mgmt_affecting flag,
        # kept as the string 'True' -- confirm against FakeAlarm.
        self.alarm_list = [FakeAlarm('900.001', 'True')]

    def get_alarms(self):
        """Return the canned alarm list."""
        return self.alarm_list
|
||||
|
||||
|
||||
class FakeFMClientAlarm(object):
    """Fake FM client that reports alarms 100.001 and 100.002."""

    def __init__(self, region, session):
        self.region, self.session = region, session
        # Second FakeAlarm argument is presumably the mgmt_affecting flag,
        # kept as the string 'True' -- confirm against FakeAlarm.
        self.alarm_list = [
            FakeAlarm(alarm_id, 'True')
            for alarm_id in ('100.001', '100.002')
        ]

    def get_alarms(self):
        """Return the canned alarm list."""
        return self.alarm_list
|
||||
|
||||
|
||||
class Controller(object):
|
||||
def __init__(self, hostname):
|
||||
self.hostname = hostname
|
||||
@ -1973,13 +1994,14 @@ class TestSwUpdateManager(base.DCManagerTestCase):
|
||||
self.assertEqual(updated_strategy_steps[0]['state'],
|
||||
consts.STRATEGY_STATE_CREATING_STRATEGY)
|
||||
|
||||
@mock.patch.object(patch_orch_thread, 'FmClient')
|
||||
@mock.patch.object(patch_orch_thread, 'SysinvClient')
|
||||
@mock.patch.object(os_path, 'isfile')
|
||||
@mock.patch.object(patch_orch_thread, 'PatchingClient')
|
||||
@mock.patch.object(threading, 'Thread')
|
||||
def test_update_subcloud_patches_management_affected_alarm(
|
||||
self, mock_threading,
|
||||
mock_patching_client, mock_os_path_isfile, mock_sysinv_client):
|
||||
self, mock_threading, mock_patching_client, mock_os_path_isfile,
|
||||
mock_sysinv_client, mock_fm_client):
|
||||
|
||||
subcloud_id = fake_subcloud.create_fake_subcloud(self.ctx).id
|
||||
subcloud = db_api.subcloud_update(
|
||||
@ -1996,6 +2018,7 @@ class TestSwUpdateManager(base.DCManagerTestCase):
|
||||
mock_os_path_isfile.return_value = True
|
||||
mock_patching_client.side_effect = FakePatchingClientAvailable
|
||||
mock_sysinv_client.side_effect = FakeSysinvClientMgmtAffectAlarm
|
||||
mock_fm_client.return_value = FakeFMClientAlarm('fake_region', 'fake_session')
|
||||
|
||||
FakePatchingClientAvailable.apply = mock.Mock()
|
||||
|
||||
@ -2014,6 +2037,49 @@ class TestSwUpdateManager(base.DCManagerTestCase):
|
||||
self.assertEqual(updated_strategy_steps[0]['state'],
|
||||
consts.STRATEGY_STATE_FAILED)
|
||||
|
||||
@mock.patch.object(patch_orch_thread, 'FmClient')
@mock.patch.object(patch_orch_thread, 'SysinvClient')
@mock.patch.object(os_path, 'isfile')
@mock.patch.object(patch_orch_thread, 'PatchingClient')
@mock.patch.object(threading, 'Thread')
def test_update_subcloud_patches_ignored_alarm(
        self, mock_threading, mock_patching_client, mock_os_path_isfile,
        mock_sysinv_client, mock_fm_client):
    """A subcloud whose FM client reports only 900.001 still gets patched.

    The sysinv fake is FakeSysinvClientMgmtAffectAlarm and the FM fake
    returns only alarm 900.001; the strategy step is expected to move on
    to the 'creating strategy' state.
    """
    # Create a managed, online subcloud with a strategy step in the
    # initial state.
    managed_subcloud = db_api.subcloud_update(
        self.ctx,
        fake_subcloud.create_fake_subcloud(self.ctx).id,
        management_state=dccommon_consts.MANAGEMENT_MANAGED,
        availability_status=dccommon_consts.AVAILABILITY_ONLINE)
    fake_strategy.create_fake_strategy_step(
        self.ctx,
        subcloud_id=managed_subcloud.id,
        state=consts.STRATEGY_STATE_INITIAL)
    strategy_step = db_api.strategy_step_get_by_name(
        self.ctx, managed_subcloud.name)

    # Wire the fakes into the patched client classes.
    mock_os_path_isfile.return_value = True
    mock_patching_client.side_effect = FakePatchingClientAvailable
    mock_sysinv_client.side_effect = FakeSysinvClientMgmtAffectAlarm
    mock_fm_client.return_value = FakeFMClientIgnoredAlarm(
        'fake_region', 'fake_session')

    FakePatchingClientAvailable.apply = mock.Mock()

    sw_update_manager.PatchOrchThread.stopped = lambda x: False
    mock_strategy_lock = mock.Mock()
    orch_thread = sw_update_manager.PatchOrchThread(
        mock_strategy_lock, self.fake_dcmanager_audit_api)
    orch_thread.get_ks_client = mock.Mock()

    # Invoke get_region_one_patches once to update the required attributes
    orch_thread.get_region_one_patches()
    orch_thread.update_subcloud_patches(strategy_step)

    # Verify that the strategy step was updated
    first_step = db_api.strategy_step_get_all(self.ctx)[0]
    self.assertEqual(first_step['state'],
                     consts.STRATEGY_STATE_CREATING_STRATEGY)
|
||||
|
||||
@mock.patch.object(patch_orch_thread, 'SysinvClient')
|
||||
@mock.patch.object(os_path, 'isfile')
|
||||
@mock.patch.object(patch_orch_thread, 'PatchingClient')
|
||||
|
Loading…
Reference in New Issue
Block a user