Refactor upgrade retries on failure path
If an image upgrade or downgrade fails for tiller or kubernetes networking, the sysinv conductor will retry at a fixed interval (one hour by default, set by the kube_upgrade_downgrade_retry_interval option) until it succeeds.
Story: 2006590
Task: 36942
Change-Id: Ia3265ecb5a26db4be7778408fbd0a0d07c75e84a
Signed-off-by: Kristine Bujold <kristine.bujold@windriver.com>
This commit is contained in:
parent
c0e09524c9
commit
50f2d8c5aa
|
@ -1,2 +1,2 @@
|
|||
SRC_DIR="sysinv"
|
||||
TIS_PATCH_VER=336
|
||||
TIS_PATCH_VER=337
|
||||
|
|
|
@ -128,6 +128,9 @@ conductor_opts = [
|
|||
cfg.IntOpt('managed_app_auto_recovery_interval',
|
||||
default=300,
|
||||
help='Interval to run managed app auto recovery'),
|
||||
cfg.IntOpt('kube_upgrade_downgrade_retry_interval',
|
||||
default=3600,
|
||||
help='Interval in seconds between retries to upgrade/downgrade kubernetes components'),
|
||||
]
|
||||
|
||||
CONF = cfg.CONF
|
||||
|
@ -5157,6 +5160,8 @@ class ConductorManager(service.PeriodicService):
|
|||
self._upgrade_downgrade_tiller()
|
||||
self._upgrade_downgrade_kube_networking()
|
||||
|
||||
@retry(retry_on_result=lambda x: x is False,
|
||||
wait_fixed=(CONF.conductor.kube_upgrade_downgrade_retry_interval * 1000))
|
||||
def _upgrade_downgrade_tiller(self):
|
||||
"""Check if tiller needs to be upgraded or downgraded"""
|
||||
LOG.info("_upgrade_downgrade_tiller")
|
||||
|
@ -5197,9 +5202,10 @@ class ConductorManager(service.PeriodicService):
|
|||
|
||||
if running_image is None:
|
||||
LOG.warning("Failed to get tiller image")
|
||||
return
|
||||
return False
|
||||
|
||||
LOG.info("Running tiller image: %s" % running_image)
|
||||
LOG.info("Requested tiller version: %s" % image_versions.TILLER_IMAGE_VERSION)
|
||||
|
||||
# Grab the version from the image name. Version is preceded
|
||||
# by a ":" e.g.
|
||||
|
@ -5207,7 +5213,7 @@ class ConductorManager(service.PeriodicService):
|
|||
running_image_name, running_version = running_image.rsplit(":", 1)
|
||||
if not running_version:
|
||||
LOG.warning("Failed to get version from tiller image")
|
||||
return
|
||||
return False
|
||||
|
||||
# Verify the tiller version running
|
||||
if running_version != image_versions.TILLER_IMAGE_VERSION:
|
||||
|
@ -5219,39 +5225,36 @@ class ConductorManager(service.PeriodicService):
|
|||
local_registry_auth = cutils.get_local_docker_registry_auth()
|
||||
self._docker._retrieve_specified_registries()
|
||||
|
||||
# download the image, retry if it fails
|
||||
while True:
|
||||
try:
|
||||
ret = self._docker.download_an_image("helm",
|
||||
local_registry_auth,
|
||||
download_image)
|
||||
if not ret:
|
||||
raise Exception
|
||||
except Exception as e:
|
||||
LOG.warning(
|
||||
"Failed to download image '%s'. %s" %
|
||||
(download_image, e))
|
||||
greenthread.sleep(FIVE_MIN_IN_SECS)
|
||||
continue
|
||||
break
|
||||
# download the image
|
||||
try:
|
||||
img_tag, ret = self._docker.download_an_image("helm",
|
||||
local_registry_auth,
|
||||
download_image)
|
||||
if not ret:
|
||||
raise Exception
|
||||
except Exception as e:
|
||||
LOG.warning("Failed to download image '%s'. %s" % (download_image, e))
|
||||
return False
|
||||
|
||||
# reset the cached registries
|
||||
self._docker._reset_registries_info()
|
||||
|
||||
# Update the new image, retry if it fails
|
||||
while True:
|
||||
try:
|
||||
helm_utils.helm_upgrade_tiller(download_image)
|
||||
# Update the new image
|
||||
try:
|
||||
helm_utils.helm_upgrade_tiller(download_image)
|
||||
|
||||
except Exception as e:
|
||||
LOG.warning("Failed to update the new image: %s" % e)
|
||||
greenthread.sleep(FIVE_MIN_IN_SECS)
|
||||
continue
|
||||
break
|
||||
except Exception as e:
|
||||
LOG.warning("Failed to update the new image: %s" % e)
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
LOG.error("{}. Failed to upgrade/downgrade tiller.".format(e))
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
@retry(retry_on_result=lambda x: x is False,
|
||||
wait_fixed=(CONF.conductor.kube_upgrade_downgrade_retry_interval * 1000))
|
||||
def _upgrade_downgrade_kube_networking(self):
|
||||
try:
|
||||
LOG.info(
|
||||
|
@ -5271,6 +5274,9 @@ class ConductorManager(service.PeriodicService):
|
|||
except Exception as e:
|
||||
LOG.error("Failed to upgrade/downgrade kubernetes "
|
||||
"networking images: {}".format(e))
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def check_nodes_stable(self):
|
||||
hosts = self.dbapi.ihost_get_list()
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
# -*- encoding: utf-8 -*-
|
||||
#
|
||||
#
|
||||
# Copyright (c) 2017-2018 Wind River Systems, Inc.
|
||||
# Copyright (c) 2017-2019 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
@ -546,6 +546,9 @@ class StorageTierDependentTCs(base.FunctionalTest):
|
|||
set_monitors_status_patcher = mock.patch.object(ceph_utils.CephApiOperator, 'get_monitors_status')
|
||||
set_is_initial_config_patcher = mock.patch.object(cutils, 'is_initial_config_complete')
|
||||
|
||||
upgrade_downgrade_kube_components_patcher = mock.patch.object(
|
||||
manager.ConductorManager, '_upgrade_downgrade_kube_components')
|
||||
|
||||
def setUp(self):
|
||||
super(StorageTierDependentTCs, self).setUp()
|
||||
self.mock_set_crushmap = self.set_crushmap_patcher.start()
|
||||
|
@ -563,11 +566,14 @@ class StorageTierDependentTCs(base.FunctionalTest):
|
|||
self.host_index = -1
|
||||
self.mon_index = -1
|
||||
|
||||
self.mock_upgrade_downgrade_kube_components = self.upgrade_downgrade_kube_components_patcher.start()
|
||||
|
||||
def tearDown(self):
|
||||
super(StorageTierDependentTCs, self).tearDown()
|
||||
self.set_crushmap_patcher.stop()
|
||||
self.set_monitors_status_patcher = self.set_monitors_status_patcher.stop()
|
||||
self.set_is_initial_config_patcher.stop()
|
||||
self.upgrade_downgrade_kube_components_patcher.stop()
|
||||
|
||||
def assertDeleted(self, fullPath):
|
||||
self.get_json(fullPath, expect_errors=True) # Make sure this line raises an error
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# vim: tabstop=4 shiftwidth=4 softtabstop=4
|
||||
# coding=utf-8
|
||||
|
||||
# Copyright (c) 2017-2018 Wind River Systems, Inc.
|
||||
# Copyright (c) 2017-2019 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
@ -40,6 +40,9 @@ class UpdateCephCluster(base.DbTestCase):
|
|||
# - test_add_valid_mix_tiers
|
||||
# - test_add_4_mix_bbbb
|
||||
|
||||
upgrade_downgrade_kube_components_patcher = mock.patch.object(
|
||||
manager.ConductorManager, '_upgrade_downgrade_kube_components')
|
||||
|
||||
def setUp(self):
|
||||
super(UpdateCephCluster, self).setUp()
|
||||
self.service = manager.ConductorManager('test-host', 'test-topic')
|
||||
|
@ -50,6 +53,12 @@ class UpdateCephCluster(base.DbTestCase):
|
|||
self.load = utils.create_test_load()
|
||||
self.host_index = -1
|
||||
|
||||
self.mock_upgrade_downgrade_kube_components = self.upgrade_downgrade_kube_components_patcher.start()
|
||||
|
||||
def tearDown(self):
|
||||
super(UpdateCephCluster, self).tearDown()
|
||||
self.upgrade_downgrade_kube_components_patcher.stop()
|
||||
|
||||
def _create_storage_ihost(self, hostname):
|
||||
self.host_index += 1
|
||||
ihost_dict = utils.get_test_ihost(
|
||||
|
|
|
@ -17,11 +17,13 @@
|
|||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
# Copyright (c) 2013-2016 Wind River Systems, Inc.
|
||||
# Copyright (c) 2013-2019 Wind River Systems, Inc.
|
||||
#
|
||||
|
||||
"""Test class for Sysinv ManagerService."""
|
||||
|
||||
import mock
|
||||
|
||||
from sysinv.common import exception
|
||||
from sysinv.conductor import manager
|
||||
from sysinv.db import api as dbapi
|
||||
|
@ -32,6 +34,9 @@ from sysinv.tests.db import utils
|
|||
|
||||
class ManagerTestCase(base.DbTestCase):
|
||||
|
||||
upgrade_downgrade_kube_components_patcher = mock.patch.object(
|
||||
manager.ConductorManager, '_upgrade_downgrade_kube_components')
|
||||
|
||||
def setUp(self):
|
||||
super(ManagerTestCase, self).setUp()
|
||||
self.service = manager.ConductorManager('test-host', 'test-topic')
|
||||
|
@ -41,6 +46,12 @@ class ManagerTestCase(base.DbTestCase):
|
|||
self.system = utils.create_test_isystem()
|
||||
self.load = utils.create_test_load()
|
||||
|
||||
self.mock_upgrade_downgrade_kube_components = self.upgrade_downgrade_kube_components_patcher.start()
|
||||
|
||||
def tearDown(self):
|
||||
super(ManagerTestCase, self).tearDown()
|
||||
self.upgrade_downgrade_kube_components_patcher.stop()
|
||||
|
||||
def _create_test_ihost(self, **kwargs):
|
||||
# ensure the system ID for proper association
|
||||
kwargs['forisystemid'] = self.system['id']
|
||||
|
|
Loading…
Reference in New Issue