Refactor upgrade re-tries on failure path

If an image upgrade or downgrade fails for tiller or kubernetes
networking, the sysinv conductor retries it every hour (the default of
kube_upgrade_downgrade_retry_interval, 3600 seconds) until it succeeds.

Story: 2006590
Task: 36942
Change-Id: Ia3265ecb5a26db4be7778408fbd0a0d07c75e84a
Signed-off-by: Kristine Bujold <kristine.bujold@windriver.com>
This commit is contained in:
Kristine Bujold 2019-10-17 09:37:59 -04:00
parent c0e09524c9
commit 50f2d8c5aa
5 changed files with 62 additions and 30 deletions

View File

@ -1,2 +1,2 @@
SRC_DIR="sysinv"
TIS_PATCH_VER=336
TIS_PATCH_VER=337

View File

@ -128,6 +128,9 @@ conductor_opts = [
cfg.IntOpt('managed_app_auto_recovery_interval',
default=300,
help='Interval to run managed app auto recovery'),
cfg.IntOpt('kube_upgrade_downgrade_retry_interval',
default=3600,
help='Interval in seconds between retries to upgrade/downgrade kubernetes components'),
]
CONF = cfg.CONF
@ -5157,6 +5160,8 @@ class ConductorManager(service.PeriodicService):
self._upgrade_downgrade_tiller()
self._upgrade_downgrade_kube_networking()
@retry(retry_on_result=lambda x: x is False,
wait_fixed=(CONF.conductor.kube_upgrade_downgrade_retry_interval * 1000))
def _upgrade_downgrade_tiller(self):
"""Check if tiller needs to be upgraded or downgraded"""
LOG.info("_upgrade_downgrade_tiller")
@ -5197,9 +5202,10 @@ class ConductorManager(service.PeriodicService):
if running_image is None:
LOG.warning("Failed to get tiller image")
return
return False
LOG.info("Running tiller image: %s" % running_image)
LOG.info("Requested tiller version: %s" % image_versions.TILLER_IMAGE_VERSION)
# Grab the version from the image name. Version is preceded
# by a ":" e.g.
@ -5207,7 +5213,7 @@ class ConductorManager(service.PeriodicService):
running_image_name, running_version = running_image.rsplit(":", 1)
if not running_version:
LOG.warning("Failed to get version from tiller image")
return
return False
# Verify the tiller version running
if running_version != image_versions.TILLER_IMAGE_VERSION:
@ -5219,39 +5225,36 @@ class ConductorManager(service.PeriodicService):
local_registry_auth = cutils.get_local_docker_registry_auth()
self._docker._retrieve_specified_registries()
# download the image, retry if it fails
while True:
try:
ret = self._docker.download_an_image("helm",
local_registry_auth,
download_image)
if not ret:
raise Exception
except Exception as e:
LOG.warning(
"Failed to download image '%s'. %s" %
(download_image, e))
greenthread.sleep(FIVE_MIN_IN_SECS)
continue
break
# download the image
try:
img_tag, ret = self._docker.download_an_image("helm",
local_registry_auth,
download_image)
if not ret:
raise Exception
except Exception as e:
LOG.warning("Failed to download image '%s'. %s" % (download_image, e))
return False
# reset the cached registries
self._docker._reset_registries_info()
# Update the new image, retry if it fails
while True:
try:
helm_utils.helm_upgrade_tiller(download_image)
# Update the new image
try:
helm_utils.helm_upgrade_tiller(download_image)
except Exception as e:
LOG.warning("Failed to update the new image: %s" % e)
greenthread.sleep(FIVE_MIN_IN_SECS)
continue
break
except Exception as e:
LOG.warning("Failed to update the new image: %s" % e)
return False
except Exception as e:
LOG.error("{}. Failed to upgrade/downgrade tiller.".format(e))
return False
return True
@retry(retry_on_result=lambda x: x is False,
wait_fixed=(CONF.conductor.kube_upgrade_downgrade_retry_interval * 1000))
def _upgrade_downgrade_kube_networking(self):
try:
LOG.info(
@ -5271,6 +5274,9 @@ class ConductorManager(service.PeriodicService):
except Exception as e:
LOG.error("Failed to upgrade/downgrade kubernetes "
"networking images: {}".format(e))
return False
return True
def check_nodes_stable(self):
hosts = self.dbapi.ihost_get_list()

View File

@ -2,7 +2,7 @@
# -*- encoding: utf-8 -*-
#
#
# Copyright (c) 2017-2018 Wind River Systems, Inc.
# Copyright (c) 2017-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -546,6 +546,9 @@ class StorageTierDependentTCs(base.FunctionalTest):
set_monitors_status_patcher = mock.patch.object(ceph_utils.CephApiOperator, 'get_monitors_status')
set_is_initial_config_patcher = mock.patch.object(cutils, 'is_initial_config_complete')
upgrade_downgrade_kube_components_patcher = mock.patch.object(
manager.ConductorManager, '_upgrade_downgrade_kube_components')
def setUp(self):
super(StorageTierDependentTCs, self).setUp()
self.mock_set_crushmap = self.set_crushmap_patcher.start()
@ -563,11 +566,14 @@ class StorageTierDependentTCs(base.FunctionalTest):
self.host_index = -1
self.mon_index = -1
self.mock_upgrade_downgrade_kube_components = self.upgrade_downgrade_kube_components_patcher.start()
def tearDown(self):
    """Stop the patchers used by this test case after the base teardown."""
    super(StorageTierDependentTCs, self).tearDown()
    self.set_crushmap_patcher.stop()
    # stop() returns None; call it without re-assigning so the patcher
    # attribute is not replaced by None on this instance.
    self.set_monitors_status_patcher.stop()
    self.set_is_initial_config_patcher.stop()
    self.upgrade_downgrade_kube_components_patcher.stop()
def assertDeleted(self, fullPath):
self.get_json(fullPath, expect_errors=True) # Make sure this line raises an error

View File

@ -1,7 +1,7 @@
# vim: tabstop=4 shiftwidth=4 softtabstop=4
# coding=utf-8
# Copyright (c) 2017-2018 Wind River Systems, Inc.
# Copyright (c) 2017-2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -40,6 +40,9 @@ class UpdateCephCluster(base.DbTestCase):
# - test_add_valid_mix_tiers
# - test_add_4_mix_bbbb
upgrade_downgrade_kube_components_patcher = mock.patch.object(
manager.ConductorManager, '_upgrade_downgrade_kube_components')
def setUp(self):
super(UpdateCephCluster, self).setUp()
self.service = manager.ConductorManager('test-host', 'test-topic')
@ -50,6 +53,12 @@ class UpdateCephCluster(base.DbTestCase):
self.load = utils.create_test_load()
self.host_index = -1
self.mock_upgrade_downgrade_kube_components = self.upgrade_downgrade_kube_components_patcher.start()
def tearDown(self):
# Run the base class teardown first, then stop the patcher that replaces
# ConductorManager._upgrade_downgrade_kube_components for this test case.
super(UpdateCephCluster, self).tearDown()
self.upgrade_downgrade_kube_components_patcher.stop()
def _create_storage_ihost(self, hostname):
self.host_index += 1
ihost_dict = utils.get_test_ihost(

View File

@ -17,11 +17,13 @@
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2013-2016 Wind River Systems, Inc.
# Copyright (c) 2013-2019 Wind River Systems, Inc.
#
"""Test class for Sysinv ManagerService."""
import mock
from sysinv.common import exception
from sysinv.conductor import manager
from sysinv.db import api as dbapi
@ -32,6 +34,9 @@ from sysinv.tests.db import utils
class ManagerTestCase(base.DbTestCase):
upgrade_downgrade_kube_components_patcher = mock.patch.object(
manager.ConductorManager, '_upgrade_downgrade_kube_components')
def setUp(self):
super(ManagerTestCase, self).setUp()
self.service = manager.ConductorManager('test-host', 'test-topic')
@ -41,6 +46,12 @@ class ManagerTestCase(base.DbTestCase):
self.system = utils.create_test_isystem()
self.load = utils.create_test_load()
self.mock_upgrade_downgrade_kube_components = self.upgrade_downgrade_kube_components_patcher.start()
def tearDown(self):
# Run the base class teardown first, then stop the patcher that replaces
# ConductorManager._upgrade_downgrade_kube_components for this test case.
super(ManagerTestCase, self).tearDown()
self.upgrade_downgrade_kube_components_patcher.stop()
def _create_test_ihost(self, **kwargs):
# ensure the system ID for proper association
kwargs['forisystemid'] = self.system['id']