From 029e3eecf50d5000142c8f3ef6b203437e313c07 Mon Sep 17 00:00:00 2001 From: Sachin Gopala Krishna Date: Mon, 12 Dec 2022 10:55:09 -0500 Subject: [PATCH] Add system command and periodic audit to transition state system kube-upgrade-* commands can get stuck in upgrading-* state with no way to continue the upgrade. The 'system kube-upgrade-failed' command was created to manually set the state to *-failed. Created kube-upgrade-failed command to manually set status to *-failed. Created a 30-minute periodic task _audit_kube_upgrade_states to automatically change the kube_upgrade state to *-failed if the state has been stuck in 'upgrading-*' for more than 1 hour. Updated kube_upgrade_controller to support state transition to *-failed state. Test Plan: PASS: Manually edit kube_upgrade state to upgrading-* and execute 'system kube-upgrade-failed' and verify the state transition to *-failed PASS: Manually edit kube_upgrade state to upgrading-* after kube_upgrade completion and wait for one hour and verify state transition to *-failed based on updated_at time stamp PASS: Verify the functionality of _audit_kube_upgrade_states and kube-upgrade-failed by building ISO Closes-Bug: 1999405 Signed-off-by: Sachin Gopala Krishna Change-Id: I499fb2909f11dc2b240dbf2e03ccfd95f1fd2e62 --- .../cgtsclient/v1/kube_upgrade_shell.py | 33 +++++++++++ .../sysinv/api/controllers/v1/kube_upgrade.py | 17 +++++- .../sysinv/sysinv/sysinv/conductor/manager.py | 31 ++++++++++ .../sysinv/tests/api/test_kube_upgrade.py | 59 +++++++++++++++++++ 4 files changed, 139 insertions(+), 1 deletion(-) diff --git a/sysinv/cgts-client/cgts-client/cgtsclient/v1/kube_upgrade_shell.py b/sysinv/cgts-client/cgts-client/cgtsclient/v1/kube_upgrade_shell.py index 06bd93b5b9..e7beb4a325 100755 --- a/sysinv/cgts-client/cgts-client/cgtsclient/v1/kube_upgrade_shell.py +++ b/sysinv/cgts-client/cgts-client/cgtsclient/v1/kube_upgrade_shell.py @@ -11,6 +11,8 @@ from cgtsclient import exc KUBE_UPGRADE_STATE_DOWNLOADING_IMAGES = 
'downloading-images' KUBE_UPGRADE_STATE_UPGRADING_NETWORKING = 'upgrading-networking' KUBE_UPGRADE_STATE_COMPLETE = 'upgrade-complete' +KUBE_UPGRADE_STATE_UPGRADING_FIRST_MASTER = 'upgrading-first-master' +KUBE_UPGRADE_STATE_UPGRADING_SECOND_MASTER = 'upgrading-second-master' def _print_kube_upgrade_show(obj): @@ -109,3 +111,34 @@ def do_kube_upgrade_delete(cc, args): raise exc.CommandError('Kubernetes upgrade not found') print("Kubernetes upgrade deleted") + + +def do_kube_upgrade_failed(cc, args): + """Set kubernetes upgrade status to *-failed""" + + kube_upgrade_state_map = { + KUBE_UPGRADE_STATE_DOWNLOADING_IMAGES: "downloading-images-failed", + KUBE_UPGRADE_STATE_UPGRADING_NETWORKING: "upgrading-networking-failed", + KUBE_UPGRADE_STATE_UPGRADING_FIRST_MASTER: "upgrading-first-master-failed", + KUBE_UPGRADE_STATE_UPGRADING_SECOND_MASTER: "upgrading-second-master-failed" + } + + kube_upgrades = cc.kube_upgrade.list() + if kube_upgrades: + current_state = getattr(kube_upgrades[0], 'state', '') + if kube_upgrade_state_map.get(current_state): + data = dict() + data['state'] = kube_upgrade_state_map.get(current_state) + patch = [] + for (k, v) in data.items(): + patch.append({'op': 'replace', 'path': '/' + k, 'value': v}) + + try: + kube_upgrade = cc.kube_upgrade.update(patch) + except exc.HTTPNotFound: + raise exc.CommandError('Kubernetes upgrade not found') + _print_kube_upgrade_show(kube_upgrade) + else: + print('Kubernetes upgrade is in %s state, cannot be set to failed' % current_state) + else: + print('A kubernetes upgrade is not in progress') diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/kube_upgrade.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/kube_upgrade.py index 2237051cf1..ae28d91b20 100755 --- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/kube_upgrade.py +++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/kube_upgrade.py @@ -325,7 +325,22 @@ class KubeUpgradeController(rest.RestController): raise wsme.exc.ClientSideError(_( "A 
kubernetes upgrade is not in progress")) - if updates['state'] == kubernetes.KUBE_UPGRADE_DOWNLOADING_IMAGES: + if updates['state'] and updates['state'].split('-')[-1] == 'failed': + if kube_upgrade_obj.state in [ + kubernetes.KUBE_UPGRADE_DOWNLOADING_IMAGES, + kubernetes.KUBE_UPGRADING_FIRST_MASTER, + kubernetes.KUBE_UPGRADING_SECOND_MASTER, + kubernetes.KUBE_UPGRADING_NETWORKING]: + kube_upgrade_obj.state = updates['state'] + kube_upgrade_obj.save() + LOG.info("Kubernetes upgrade state is changed to %s" % updates['state']) + return KubeUpgrade.convert_with_links(kube_upgrade_obj) + else: + raise wsme.exc.ClientSideError(_( + "A kubernetes upgrade is in %s state cannot be set to failed" + % kube_upgrade_obj.state)) + + elif updates['state'] == kubernetes.KUBE_UPGRADE_DOWNLOADING_IMAGES: # Make sure upgrade is in the correct state to download images if kube_upgrade_obj.state not in [ kubernetes.KUBE_UPGRADE_STARTED, diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py index a9ca7e50b6..ca4859f9e2 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py @@ -49,6 +49,7 @@ import uuid import xml.etree.ElementTree as ElementTree from contextlib import contextmanager from datetime import datetime +from datetime import timedelta from distutils.util import strtobool from copy import deepcopy @@ -168,6 +169,7 @@ audit_intervals_opts = [ cfg.IntOpt('storage_backend_failure', default=400), cfg.IntOpt('k8s_application', default=60), cfg.IntOpt('device_image_update', default=300), + cfg.IntOpt('kube_upgrade_states', default=1800), ] CONF = cfg.CONF @@ -16442,6 +16444,35 @@ class ConductorManager(service.PeriodicService): LOG.info('Successfully updated %s feature-gates service param.' 
% section) return 0 + @periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.kube_upgrade_states) + def _audit_kube_upgrade_states(self, context): + # A Kubernetes upgrade state can be stuck in upgrading-* state. + # To avoid this situation we audit the sanity of the states, + # after 2 audit cycles if the states are not changed then set + # the kube_state to *-failed. + + kube_upgrade_state_map = dict() + kube_upgrade_state_map["downloading-images"] = "downloading-images-failed" + kube_upgrade_state_map["upgrading-networking"] = "upgrading-networking-failed" + kube_upgrade_state_map["upgrading-first-master"] = "upgrading-first-master-failed" + kube_upgrade_state_map["upgrading-second-master"] = "upgrading-second-master-failed" + + try: + kube_upgrade = self.dbapi.kube_upgrade_get_one() + current_state = getattr(kube_upgrade, 'state', '') + if kube_upgrade_state_map.get(current_state): + kube_upgrade_time_stamp = getattr(kube_upgrade, 'updated_at') + if datetime.utcnow() - kube_upgrade_time_stamp >= timedelta( + seconds=CONF.conductor_periodic_task_intervals.kube_upgrade_states * 2): + self.dbapi.kube_upgrade_update(kube_upgrade.uuid, + {'state': kube_upgrade_state_map[current_state]}) + LOG.info( + "Kube_upgrade state changed from " + "'%s' to '%s'", current_state, + kube_upgrade_state_map[current_state]) + except exception.NotFound: + LOG.debug("A kubernetes upgrade is not in progress") + def device_image_state_sort_key(dev_img_state): if dev_img_state.bitstream_type == dconstants.BITSTREAM_TYPE_ROOT_KEY: diff --git a/sysinv/sysinv/sysinv/sysinv/tests/api/test_kube_upgrade.py b/sysinv/sysinv/sysinv/sysinv/tests/api/test_kube_upgrade.py index fc0b111a7e..3c714d130c 100644 --- a/sysinv/sysinv/sysinv/sysinv/tests/api/test_kube_upgrade.py +++ b/sysinv/sysinv/sysinv/sysinv/tests/api/test_kube_upgrade.py @@ -843,6 +843,65 @@ class TestPatch(TestKubeUpgrade, self.assertIn("Invalid state", result.json['error_message']) + def 
test_update_failed_state(self): + # Test updating the state of an upgrade with a failed state + + # Create the upgrade + kube_upgrade = dbutils.create_test_kube_upgrade( + from_version='v1.43.1', + to_version='v1.43.2', + state=kubernetes.KUBE_UPGRADE_DOWNLOADING_IMAGES) + uuid = kube_upgrade.uuid + + # Update state + new_state = kubernetes.KUBE_UPGRADE_DOWNLOADING_IMAGES_FAILED + result = self.patch_json('/kube_upgrade', + [{'path': '/state', + 'value': new_state, + 'op': 'replace'}], + headers={'User-Agent': 'sysinv-test'}, + expect_errors=True) + + # Verify the failure + self.assertEqual(result.content_type, 'application/json') + self.assertEqual(result.status_code, http_client.OK) + self.assertEqual(result.json['state'], new_state) + + # see if state was changed in DB + kube_cmd_version = self.dbapi.kube_upgrade_get_one() + self.assertEqual(kube_cmd_version.state, new_state) + + # Verify that the upgrade was updated with the new state + result = self.get_json('/kube_upgrade/%s' % uuid) + self.assertEqual(result['from_version'], 'v1.43.1') + self.assertEqual(result['to_version'], 'v1.43.2') + self.assertEqual(result['state'], new_state) + + def test_update_state_failed_invalid_state(self): + # Test updating the invalid state of an upgrade with a failed state + + # Create the upgrade + dbutils.create_test_kube_upgrade( + from_version='v1.43.1', + to_version='v1.43.2', + state=kubernetes.KUBE_UPGRADE_DOWNLOADED_IMAGES) + + # Update state + new_state = kubernetes.KUBE_UPGRADE_DOWNLOADING_IMAGES_FAILED + result = self.patch_json('/kube_upgrade', + [{'path': '/state', + 'value': new_state, + 'op': 'replace'}], + headers={'User-Agent': 'sysinv-test'}, + expect_errors=True) + + # Verify the failure + self.assertEqual(result.content_type, 'application/json') + self.assertEqual(result.status_code, http_client.BAD_REQUEST) + self.assertIn(("A kubernetes upgrade is in downloaded-images state " + "cannot be set to failed"), + result.json['error_message']) + class 
TestDelete(TestKubeUpgrade):