Add system command and periodic audit to transition state

The 'system kube-upgrade-*' commands can get stuck in an upgrading-* state
with no way to continue the upgrade. The 'system kube-upgrade-failed' command
was created to manually set the state to *-failed.

Created kube-upgrade-failed command to manually set status to *-failed.
Created a 30-minute periodic task, _audit_kube_upgrade_states, to
automatically change the kube_upgrade state to *-failed if the state
has been stuck in 'upgrading-*' for more than 1 hour.
Updated kube_upgrade_controller to support state transitions to the
*-failed states.

Test Plan:
PASS: Manually edit kube_upgrade state to upgrading-* and execute
'system kube-upgrade-failed' and verify the state transition to *-failed
PASS: Manually edit kube_upgrade state to upgrading-* after kube_upgrade
completion and wait for one hour and verify state transition to *-failed
based on updated_at time stamp
PASS: Verify the functionality of _audit_kube_upgrade_states and
kube-upgrade-failed by building ISO

Closes-Bug: 1999405

Signed-off-by: Sachin Gopala Krishna <saching.krishna@windriver.com>
Change-Id: I499fb2909f11dc2b240dbf2e03ccfd95f1fd2e62
This commit is contained in:
Sachin Gopala Krishna 2022-12-12 10:55:09 -05:00
parent 5036eb895a
commit 029e3eecf5
4 changed files with 139 additions and 1 deletions

View File

@ -11,6 +11,8 @@ from cgtsclient import exc
KUBE_UPGRADE_STATE_DOWNLOADING_IMAGES = 'downloading-images' KUBE_UPGRADE_STATE_DOWNLOADING_IMAGES = 'downloading-images'
KUBE_UPGRADE_STATE_UPGRADING_NETWORKING = 'upgrading-networking' KUBE_UPGRADE_STATE_UPGRADING_NETWORKING = 'upgrading-networking'
KUBE_UPGRADE_STATE_COMPLETE = 'upgrade-complete' KUBE_UPGRADE_STATE_COMPLETE = 'upgrade-complete'
# In-progress control-plane upgrade states. do_kube_upgrade_failed maps each
# of these to its matching '*-failed' state.
KUBE_UPGRADE_STATE_UPGRADING_FIRST_MASTER = 'upgrading-first-master'
KUBE_UPGRADE_STATE_UPGRADING_SECOND_MASTER = 'upgrading-second-master'
def _print_kube_upgrade_show(obj): def _print_kube_upgrade_show(obj):
@ -109,3 +111,34 @@ def do_kube_upgrade_delete(cc, args):
raise exc.CommandError('Kubernetes upgrade not found') raise exc.CommandError('Kubernetes upgrade not found')
print("Kubernetes upgrade deleted") print("Kubernetes upgrade deleted")
def do_kube_upgrade_failed(cc, args):
    """Set kubernetes upgrade status to *-failed"""
    # Only these in-progress states have a '*-failed' counterpart that the
    # command is allowed to move the upgrade into.
    failed_state_for = {
        KUBE_UPGRADE_STATE_DOWNLOADING_IMAGES: "downloading-images-failed",
        KUBE_UPGRADE_STATE_UPGRADING_NETWORKING: "upgrading-networking-failed",
        KUBE_UPGRADE_STATE_UPGRADING_FIRST_MASTER: "upgrading-first-master-failed",
        KUBE_UPGRADE_STATE_UPGRADING_SECOND_MASTER: "upgrading-second-master-failed"
    }

    upgrades = cc.kube_upgrade.list()
    if not upgrades:
        print('A kubernetes upgrade is not in progress')
        return

    # Only the first upgrade returned by the API is inspected.
    current_state = getattr(upgrades[0], 'state', '')
    failed_state = failed_state_for.get(current_state)
    if not failed_state:
        print('Kubernetes upgrade is in %s state, cannot be set to failed'
              % current_state)
        return

    # Single-operation patch document replacing the upgrade's state.
    patch = [{'op': 'replace', 'path': '/state', 'value': failed_state}]
    try:
        kube_upgrade = cc.kube_upgrade.update(patch)
    except exc.HTTPNotFound:
        raise exc.CommandError('Kubernetes upgrade not found')

    _print_kube_upgrade_show(kube_upgrade)

View File

@ -325,7 +325,22 @@ class KubeUpgradeController(rest.RestController):
raise wsme.exc.ClientSideError(_( raise wsme.exc.ClientSideError(_(
"A kubernetes upgrade is not in progress")) "A kubernetes upgrade is not in progress"))
if updates['state'] == kubernetes.KUBE_UPGRADE_DOWNLOADING_IMAGES: if updates['state'] and updates['state'].split('-')[-1] == 'failed':
if kube_upgrade_obj.state in [
kubernetes.KUBE_UPGRADE_DOWNLOADING_IMAGES,
kubernetes.KUBE_UPGRADING_FIRST_MASTER,
kubernetes.KUBE_UPGRADING_SECOND_MASTER,
kubernetes.KUBE_UPGRADING_NETWORKING]:
kube_upgrade_obj.state = updates['state']
kube_upgrade_obj.save()
LOG.info("Kubernetes upgrade state is changed to %s" % updates['state'])
return KubeUpgrade.convert_with_links(kube_upgrade_obj)
else:
raise wsme.exc.ClientSideError(_(
"A kubernetes upgrade is in %s state cannot be set to failed"
% kube_upgrade_obj.state))
elif updates['state'] == kubernetes.KUBE_UPGRADE_DOWNLOADING_IMAGES:
# Make sure upgrade is in the correct state to download images # Make sure upgrade is in the correct state to download images
if kube_upgrade_obj.state not in [ if kube_upgrade_obj.state not in [
kubernetes.KUBE_UPGRADE_STARTED, kubernetes.KUBE_UPGRADE_STARTED,

View File

@ -49,6 +49,7 @@ import uuid
import xml.etree.ElementTree as ElementTree import xml.etree.ElementTree as ElementTree
from contextlib import contextmanager from contextlib import contextmanager
from datetime import datetime from datetime import datetime
from datetime import timedelta
from distutils.util import strtobool from distutils.util import strtobool
from copy import deepcopy from copy import deepcopy
@ -168,6 +169,7 @@ audit_intervals_opts = [
cfg.IntOpt('storage_backend_failure', default=400), cfg.IntOpt('storage_backend_failure', default=400),
cfg.IntOpt('k8s_application', default=60), cfg.IntOpt('k8s_application', default=60),
cfg.IntOpt('device_image_update', default=300), cfg.IntOpt('device_image_update', default=300),
cfg.IntOpt('kube_upgrade_states', default=1800),
] ]
CONF = cfg.CONF CONF = cfg.CONF
@ -16442,6 +16444,35 @@ class ConductorManager(service.PeriodicService):
LOG.info('Successfully updated %s feature-gates service param.' % section) LOG.info('Successfully updated %s feature-gates service param.' % section)
return 0 return 0
@periodic_task.periodic_task(spacing=CONF.conductor_periodic_task_intervals.kube_upgrade_states)
def _audit_kube_upgrade_states(self, context):
    """Move a kubernetes upgrade stuck in an in-progress state to *-failed.

    A kubernetes upgrade can get stuck in a downloading-images or
    upgrading-* state. When the upgrade record has not changed for at
    least two audit intervals, its state is set to the matching
    '*-failed' state so the operator can retry.

    :param context: request context supplied by the periodic task runner
                    (not used here).
    """
    failed_state_map = {
        "downloading-images": "downloading-images-failed",
        "upgrading-networking": "upgrading-networking-failed",
        "upgrading-first-master": "upgrading-first-master-failed",
        "upgrading-second-master": "upgrading-second-master-failed",
    }
    try:
        kube_upgrade = self.dbapi.kube_upgrade_get_one()
        current_state = getattr(kube_upgrade, 'state', '')
        failed_state = failed_state_map.get(current_state)
        if not failed_state:
            # Not in a state that can get stuck; nothing to audit.
            return

        # updated_at may be unset on a record that has never been updated
        # since creation; fall back to created_at instead of failing on
        # "datetime - None".
        last_change = (getattr(kube_upgrade, 'updated_at', None) or
                       getattr(kube_upgrade, 'created_at', None))
        if last_change is None:
            LOG.debug("Kubernetes upgrade has no timestamp, "
                      "skipping state audit")
            return

        # NOTE(review): assumes the timestamp is a naive UTC datetime,
        # comparable with datetime.utcnow() - confirm.
        stuck_threshold = timedelta(
            seconds=CONF.conductor_periodic_task_intervals.kube_upgrade_states * 2)
        if datetime.utcnow() - last_change >= stuck_threshold:
            self.dbapi.kube_upgrade_update(kube_upgrade.uuid,
                                           {'state': failed_state})
            LOG.info(
                "Kube_upgrade state changed from "
                "'%s' to '%s'", current_state, failed_state)
    except exception.NotFound:
        LOG.debug("A kubernetes upgrade is not in progress")
def device_image_state_sort_key(dev_img_state): def device_image_state_sort_key(dev_img_state):
if dev_img_state.bitstream_type == dconstants.BITSTREAM_TYPE_ROOT_KEY: if dev_img_state.bitstream_type == dconstants.BITSTREAM_TYPE_ROOT_KEY:

View File

@ -843,6 +843,65 @@ class TestPatch(TestKubeUpgrade,
self.assertIn("Invalid state", self.assertIn("Invalid state",
result.json['error_message']) result.json['error_message'])
def test_update_failed_state(self):
    # An upgrade in an in-progress state can be patched to its
    # matching *-failed state.
    upgrade = dbutils.create_test_kube_upgrade(
        from_version='v1.43.1',
        to_version='v1.43.2',
        state=kubernetes.KUBE_UPGRADE_DOWNLOADING_IMAGES)
    upgrade_uuid = upgrade.uuid

    # Request the transition to the failed state
    failed_state = kubernetes.KUBE_UPGRADE_DOWNLOADING_IMAGES_FAILED
    state_patch = [{'path': '/state',
                    'value': failed_state,
                    'op': 'replace'}]
    response = self.patch_json('/kube_upgrade',
                               state_patch,
                               headers={'User-Agent': 'sysinv-test'},
                               expect_errors=True)

    # The request is accepted and echoes the new state
    self.assertEqual(response.content_type, 'application/json')
    self.assertEqual(response.status_code, http_client.OK)
    self.assertEqual(response.json['state'], failed_state)

    # The new state is persisted in the DB
    db_upgrade = self.dbapi.kube_upgrade_get_one()
    self.assertEqual(db_upgrade.state, failed_state)

    # The API reports the updated upgrade with versions untouched
    fetched = self.get_json('/kube_upgrade/%s' % upgrade_uuid)
    self.assertEqual(fetched['from_version'], 'v1.43.1')
    self.assertEqual(fetched['to_version'], 'v1.43.2')
    self.assertEqual(fetched['state'], failed_state)
def test_update_state_failed_invalid_state(self):
    # An upgrade that is not in an in-progress state must reject a
    # patch to a *-failed state.
    dbutils.create_test_kube_upgrade(
        from_version='v1.43.1',
        to_version='v1.43.2',
        state=kubernetes.KUBE_UPGRADE_DOWNLOADED_IMAGES)

    # Request the transition to the failed state
    failed_state = kubernetes.KUBE_UPGRADE_DOWNLOADING_IMAGES_FAILED
    state_patch = [{'path': '/state',
                    'value': failed_state,
                    'op': 'replace'}]
    response = self.patch_json('/kube_upgrade',
                               state_patch,
                               headers={'User-Agent': 'sysinv-test'},
                               expect_errors=True)

    # The request is rejected with an explanatory message
    expected_message = ("A kubernetes upgrade is in downloaded-images state "
                        "cannot be set to failed")
    self.assertEqual(response.content_type, 'application/json')
    self.assertEqual(response.status_code, http_client.BAD_REQUEST)
    self.assertIn(expected_message, response.json['error_message'])
class TestDelete(TestKubeUpgrade): class TestDelete(TestKubeUpgrade):