Audit application status upon sysinv-conductor startup

If the sysinv-conductor process is abruptly terminated (due to power
loss, OOM, a missed audit cycle, etc.) while an application upload/
apply/update/remove is in progress, the status of the application
is stuck in uploading/applying/updating/removing, preventing any
subsequent system application command from resuming (or reverting) the operation.

This is the first commit of a multi-commit change to improve the
robustness of the application framework.

Partial-Bug: 1833323
Change-Id: I73129d5621c77b50c2e29c078e6b99089244129f
Signed-off-by: Tee Ngo <Tee.Ngo@windriver.com>
This commit is contained in:
Tee Ngo 2019-07-10 16:03:25 -04:00
parent 2c04825fc9
commit e4feb091aa
2 changed files with 95 additions and 8 deletions

View File

@ -31,6 +31,7 @@ class KubeOperator(object):
self._dbapi = dbapi
self._kube_client_batch = None
self._kube_client_core = None
self._kube_client_custom_objects = None
def _load_kube_config(self):
config.load_kube_config('/etc/kubernetes/admin.conf')
@ -52,6 +53,12 @@ class KubeOperator(object):
self._kube_client_core = client.CoreV1Api()
return self._kube_client_core
def _get_kubernetesclient_custom_objects(self):
if not self._kube_client_custom_objects:
self._load_kube_config()
self._kube_client_custom_objects = client.CustomObjectsApi()
return self._kube_client_custom_objects
def kube_patch_node(self, name, body):
try:
api_response = self._get_kubernetesclient_core().patch_node(name, body)
@ -264,3 +271,18 @@ class KubeOperator(object):
LOG.error("Failed to delete Jobs with label %s under "
"Namespace %s: %s" % (label, namespace, e))
raise
def delete_custom_resource(self, group, version, namespace, plural, name):
    """Delete a namespaced custom object, tolerating its absence.

    :param group: API group of the custom resource
    :param version: API version of the custom resource
    :param namespace: namespace that holds the object
    :param plural: plural name of the custom resource
    :param name: name of the object to delete
    :raises ApiException: when the API call fails for any reason other
        than the object not being found
    :raises Exception: any other error raised while issuing the delete
    """
    c = self._get_kubernetesclient_custom_objects()
    body = {}
    try:
        c.delete_namespaced_custom_object(group, version, namespace,
                                          plural, name, body)
    except ApiException as ex:
        if ex.status == 404 or ex.reason == "Not Found":
            # The object is already gone; there is nothing left to delete.
            return
        # Any other API failure used to be dropped silently; report it the
        # same way as non-API errors so callers can decide what to do.
        LOG.error("Failed to delete custom object, Namespace %s: %s"
                  % (namespace, ex))
        raise
    except Exception as e:
        LOG.error("Failed to delete custom object, Namespace %s: %s"
                  % (namespace, e))
        raise

View File

@ -70,6 +70,11 @@ DOCKER_REGISTRY_SECRET = 'default-registry-key'
ARMADA_HOST_LOG_LOCATION = '/var/log/armada'
ARMADA_CONTAINER_LOG_LOCATION = '/logs'
# Identifiers of the Armada lock custom resource: API group, version,
# namespace, plural and object name. They are combined and passed to the
# Kubernetes custom-object delete call when clearing leftover locks.
ARMADA_LOCK_GROUP = 'armada.process'
ARMADA_LOCK_VERSION = 'v1'
ARMADA_LOCK_NAMESPACE = 'kube-system'
ARMADA_LOCK_PLURAL = 'locks'
ARMADA_LOCK_NAME = 'lock'
# Helper functions
@ -141,9 +146,69 @@ class AppOperator(object):
self._docker = DockerHelper(self._dbapi)
self._helm = helm.HelmOperator(self._dbapi)
self._kube = kubernetes.KubeOperator(self._dbapi)
self._app = kube_app.KubeAppHelper(self._dbapi)
self._utils = kube_app.KubeAppHelper(self._dbapi)
self._lock = threading.Lock()
if not os.path.isfile(constants.ANSIBLE_BOOTSTRAP_FLAG):
self._clear_stuck_applications()
def _clear_armada_locks(self):
    """Best-effort removal of the Armada lock custom object.

    The object name is built as "<plural>.<group>.<name>" from the
    ARMADA_LOCK_* constants. Failures never propagate to the caller; they
    are only logged.
    """
    lock_name = "{}.{}.{}".format(ARMADA_LOCK_PLURAL,
                                  ARMADA_LOCK_GROUP,
                                  ARMADA_LOCK_NAME)
    try:
        self._kube.delete_custom_resource(ARMADA_LOCK_GROUP,
                                          ARMADA_LOCK_VERSION,
                                          ARMADA_LOCK_NAMESPACE,
                                          ARMADA_LOCK_PLURAL,
                                          lock_name)
    except Exception as e:
        # Best effort delete: keep going, but leave a trace so a lock that
        # could not be removed is visible in the logs.
        LOG.warning("Failed to clear Armada lock %s: %s" % (lock_name, e))
def _clear_stuck_applications(self):
apps = self._dbapi.kube_app_get_all()
for app in apps:
if (app.status == constants.APP_APPLY_IN_PROGRESS or
app.status == constants.APP_UPDATE_IN_PROGRESS or
app.status == constants.APP_RECOVER_IN_PROGRESS):
if app.status == constants.APP_APPLY_IN_PROGRESS:
op = 'application-apply'
else:
op = 'application-update'
if app.name in constants.HELM_APPS_PLATFORM_MANAGED:
# For platform core apps, set the new status
# to 'uploaded'. The audit task will kick in with
# all its pre-requisite checks before reapplying.
new_status = constants.APP_UPLOAD_SUCCESS
else:
new_status = constants.APP_APPLY_FAILURE
elif app.status == constants.APP_REMOVE_IN_PROGRESS:
op = 'application-remove'
new_status = constants.APP_REMOVE_FAILURE
elif app.status == constants.APP_UPLOAD_IN_PROGRESS:
op = 'application-upload'
new_status = constants.APP_UPLOAD_FAILURE
else:
continue
LOG.info("Resetting status of app %s from '%s' to '%s' " %
(app.name, app.status, new_status))
error_msg = "Unexpected process termination while " + op +\
" was in progress. The application status " +\
"has changed from \'" + app.status +\
"\' to \'" + new_status + "\'."
values = {'progress': error_msg, 'status': new_status}
self._dbapi.kube_app_update(app.id, values)
# Delete the Armada locks that might have been acquired previously
# for a fresh start. This guarantees that a re-apply, re-update or
# a re-remove attempt following a status reset will not fail due
# to a lock related issue.
self._clear_armada_locks()
def _cleanup(self, app, app_dir=True):
"""" Remove application directories and override files """
try:
@ -281,7 +346,7 @@ class AppOperator(object):
_handle_extract_failure()
if app.downloaded_tarfile:
name, version, patches = self._app._verify_metadata_file(
name, version, patches = self._utils._verify_metadata_file(
app.path, app.name, app.version)
if (name != app.name or version != app.version):
# Save the official application info. They will be
@ -290,7 +355,7 @@ class AppOperator(object):
if not cutils.verify_checksum(app.path):
_handle_extract_failure('checksum validation failed.')
mname, mfile = self._app._find_manifest_file(app.path)
mname, mfile = self._utils._find_manifest_file(app.path)
# Save the official manifest file info. They will be persisted
# in the next status update
app.regenerate_manifest_filename(mname, os.path.basename(mfile))
@ -299,7 +364,7 @@ class AppOperator(object):
app.path, constants.APP_METADATA_FILE)
app.patch_dependencies = patches
self._app._extract_helm_charts(app.path)
self._utils._extract_helm_charts(app.path)
except exception.SysinvException as e:
_handle_extract_failure(str(e))
@ -1252,7 +1317,7 @@ class AppOperator(object):
try:
self._cleanup(new_app, app_dir=False)
self._app._patch_report_app_dependencies(
self._utils._patch_report_app_dependencies(
new_app.name + '-' + new_app.version)
self._dbapi.kube_app_destroy(new_app.name,
version=new_app.version,
@ -1443,7 +1508,7 @@ class AppOperator(object):
self._save_images_list(app)
if app.patch_dependencies:
self._app._patch_report_app_dependencies(
self._utils._patch_report_app_dependencies(
app.name + '-' + app.version, app.patch_dependencies)
self._create_app_releases_version(app.name, app.charts)
self._update_app_status(app, constants.APP_UPLOAD_SUCCESS,
@ -1615,7 +1680,7 @@ class AppOperator(object):
% (from_chart.release, from_app.name, from_app.version))
self._cleanup(from_app, app_dir=False)
self._app._patch_report_app_dependencies(
self._utils._patch_report_app_dependencies(
from_app.name + '-' + from_app.version)
self._update_app_status(
@ -1739,7 +1804,7 @@ class AppOperator(object):
try:
self._dbapi.kube_app_destroy(app.name)
self._cleanup(app)
self._app._patch_report_app_dependencies(app.name + '-' + app.version)
self._utils._patch_report_app_dependencies(app.name + '-' + app.version)
LOG.info("Application (%s) has been purged from the system." %
app.name)
msg = None