Trigger application reapply on host unlock/delete
- Triggers an application reapply (including override regeneration) on node unlock, to pick up a newly added node or configuration changes (a condensed sketch of this trigger appears below, just before the diff).
- The reapply is also triggered on node delete if the host had a compute node label, so that any per-host overrides are removed.
- Turns on the restriction that nodes must be locked to modify labels, and adds an audit task to sync any labels that were applied before a node was unlocked and are therefore missing from k8s.
- Modifies the k8s puppet manifest to launch kubelet only on the initial configuration; after that the service file gains a dependency on the config gate. This avoids PLEG errors in kubernetes caused by the node being overwhelmed during boot.

Change-Id: I1d9ca92f451aa322765da43ffcbb1d95f97f92f2
Story: 2004520
Task: 28826
Signed-off-by: Tyler Smith <tyler.smith@windriver.com>
parent 3f6e511ec3
commit 5624c74062
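For orientation before the diff: the hunks below wire the reapply trigger into sysinv's HostController (on unlock, and on delete of a worker that carried the compute node label) and guard it on the application's current state. The following is a minimal, self-contained Python sketch of that trigger logic, not sysinv code: the constant values, the compute label string, and the App/maybe_reapply_system_app names are illustrative placeholders; in the real change the reapply is issued over RPC via perform_app_apply on the kube_app record.

# Hedged sketch (assumptions noted above); condenses the HostController hunks below.
UNLOCK_ACTION = 'unlock'                # values approximate sysinv constants
FORCE_UNLOCK_ACTION = 'force-unlock'
DELETE_ACTION = 'delete'
COMPUTE_NODE_LABEL = 'openstack-compute-node=enabled'  # assumed key=value form
APP_APPLY_SUCCESS = 'applied'
APP_APPLY_IN_PROGRESS = 'applying'


class App(object):
    """Illustrative stand-in for the stx-openstack application record."""

    def __init__(self, status):
        self.status = status

    def reapply(self):
        # sysinv marks the app in-progress and asks the conductor to re-run
        # the apply (pecan.request.rpcapi.perform_app_apply in the diff).
        self.status = APP_APPLY_IN_PROGRESS
        print("Reapplying the stx-openstack app")


def maybe_reapply_system_app(app, action, host_labels):
    """Reapply on unlock; on delete, only if the host carried the compute label."""
    if action in (UNLOCK_ACTION, FORCE_UNLOCK_ACTION):
        trigger = True
    elif action == DELETE_ACTION:
        trigger = any(label == COMPUTE_NODE_LABEL for label in host_labels)
    else:
        trigger = False

    if trigger and app.status == APP_APPLY_SUCCESS:
        app.reapply()  # otherwise the app is absent or not applied; skip


# Unlock always triggers; delete triggers only for a labelled openstack worker.
maybe_reapply_system_app(App(APP_APPLY_SUCCESS), UNLOCK_ACTION, [])
maybe_reapply_system_app(App(APP_APPLY_SUCCESS), DELETE_ACTION,
                         ['openstack-compute-node=enabled'])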
@@ -39,7 +39,6 @@ class platform::kubernetes::kubeadm {
  }
  # Start kubelet.
  -> service { 'kubelet':
    ensure => 'running',
    enable => true,
  }
  # A seperate enable is required since we have modified the service resource
@@ -124,6 +123,21 @@ class platform::kubernetes::master::init
      command => "kubectl --kubeconfig=/etc/kubernetes/admin.conf taint node ${::platform::params::hostname} node-role.kubernetes.io/master-", # lint:ignore:140chars
      logoutput => true,
    }

    # Add a dependency to kubelet on config so it doesn't enter a bad state on subsequent boots
    -> file { '/etc/systemd/system/kubelet.service.d/kube-stx-override.conf':
      ensure => file,
      content => template('platform/kube-stx-override.conf.erb'),
      owner => 'root',
      group => 'root',
      mode => '0644',
    }

    # Reload systemd
    -> exec { 'perform systemctl daemon reload for kubelet override':
      command => 'systemctl daemon-reload',
      logoutput => true,
    }
  } else {
    if str2bool($::is_initial_config) {
      # For subsequent controller installs, install kubernetes using the
@@ -206,6 +220,21 @@ class platform::kubernetes::master::init
      command => "kubectl --kubeconfig=/etc/kubernetes/admin.conf taint node ${::platform::params::hostname} node-role.kubernetes.io/master-", # lint:ignore:140chars
      logoutput => true,
    }

    # Add a dependency to kubelet on config so it doesn't enter a bad state on subsequent boots
    -> file { '/etc/systemd/system/kubelet.service.d/kube-stx-override.conf':
      ensure => file,
      content => template('platform/kube-stx-override.conf.erb'),
      owner => 'root',
      group => 'root',
      mode => '0644',
    }

    # Reload systemd
    -> exec { 'perform systemctl daemon reload for kubelet override':
      command => 'systemctl daemon-reload',
      logoutput => true,
    }
    }
    }
  }
}
@@ -242,6 +271,21 @@ class platform::kubernetes::worker::init
      logoutput => true,
      unless => 'test -f /etc/kubernetes/kubelet.conf',
    }

    # Add a dependency to kubelet on config so it doesn't enter a bad state
    -> file { '/etc/systemd/system/kubelet.service.d/kube-stx-override.conf':
      ensure => file,
      content => template('platform/kube-stx-override.conf.erb'),
      owner => 'root',
      group => 'root',
      mode => '0644',
    }

    # Reload systemd
    -> exec { 'perform systemctl daemon reload for kubelet override':
      command => 'systemctl daemon-reload',
      logoutput => true,
    }
}

class platform::kubernetes::worker
@@ -0,0 +1,2 @@
[Unit]
After=config.service
@@ -2110,6 +2110,11 @@ class HostController(rest.RestController):
                    ihost_obj['uuid'],
                    ibm_msg_dict)

                # Trigger a system app reapply if the host has been unlocked
                if (utils.is_kubernetes_config() and patched_ihost.get('action') in
                        [constants.UNLOCK_ACTION, constants.FORCE_UNLOCK_ACTION]):
                    self._reapply_system_app()

            elif mtc_response['status'] is None:
                raise wsme.exc.ClientSideError(
                    _("Timeout waiting for maintenance response. "
@@ -2341,6 +2346,15 @@ class HostController(rest.RestController):
            # wait for VIM signal
            return

        openstack_worker = False
        if utils.is_kubernetes_config():
            labels = objects.label.get_by_host_id(pecan.request.context, ihost.uuid)
            for l in labels:
                if (constants.COMPUTE_NODE_LABEL ==
                        str(l.label_key) + '=' + str(l.label_value)):
                    openstack_worker = True
                    break

        idict = {'operation': constants.DELETE_ACTION,
                 'uuid': ihost.uuid,
                 'invprovision': ihost.invprovision}
@@ -2464,6 +2478,32 @@ class HostController(rest.RestController):

        pecan.request.dbapi.ihost_destroy(ihost_id)

        # If the host being removed was an openstack worker node, trigger
        # a reapply
        if openstack_worker:
            self._reapply_system_app()

    def _reapply_system_app(self):
        try:
            db_app = objects.kube_app.get_by_name(
                pecan.request.context, constants.HELM_APP_OPENSTACK)

            if db_app.status == constants.APP_APPLY_SUCCESS:
                LOG.info(
                    "Reapplying the %s app" % constants.HELM_APP_OPENSTACK)
                db_app.status = constants.APP_APPLY_IN_PROGRESS
                db_app.progress = None
                db_app.save()
                pecan.request.rpcapi.perform_app_apply(
                    pecan.request.context, db_app)
            else:
                LOG.info("%s system app is present but not applied, "
                         "skipping re-apply" % constants.HELM_APP_OPENSTACK)
        except exception.KubeAppNotFound:
            LOG.info(
                "%s system app not present, skipping re-apply" %
                constants.HELM_APP_OPENSTACK)

    def _check_upgrade_provision_order(self, personality, hostname):
        LOG.info("_check_upgrade_provision_order personality=%s, hostname=%s" %
                 (personality, hostname))
@@ -288,11 +288,5 @@ class LabelController(rest.RestController):
# UTILS
###########
def _check_host_locked(host):

    # TODO(ksmith):
    # turn this on later
    return

    if (utils.is_aio_simplex_host_unlocked(host) or
            host.administrative != constants.ADMIN_LOCKED):
    if host.administrative != constants.ADMIN_LOCKED:
        raise wsme.exc.ClientSideError(_("Host must be locked."))
@@ -123,6 +123,7 @@ HOST_DELETE = 'host_delete'  # for personality sub-type validation

# Availability
AVAILABILITY_AVAILABLE = 'available'
AVAILABILITY_INTEST = 'intest'
AVAILABILITY_OFFLINE = 'offline'
AVAILABILITY_ONLINE = 'online'
AVAILABILITY_DEGRADED = 'degraded'
@@ -1170,6 +1170,10 @@ class HostLabelInvalid(Invalid):
    message = _("Host label is invalid. Reason: %(reason)s")


class K8sNodeNotFound(NotFound):
    message = _("Kubernetes Node %(name)s could not be found.")


class PickleableException(Exception):
    """
    Pickleable Exception
@@ -51,6 +51,19 @@ class KubeOperator(object):
            if e.status == httplib.UNPROCESSABLE_ENTITY:
                reason = json.loads(e.body).get('message', "")
                raise exception.HostLabelInvalid(reason=reason)
            elif e.status == httplib.NOT_FOUND:
                raise exception.K8sNodeNotFound(name=name)
            else:
                raise
        except Exception as e:
            LOG.error("Kubernetes exception: %s" % e)
            LOG.error("Kubernetes exception in kube_patch_node: %s" % e)
            raise

    def kube_get_nodes(self):
        try:
            api_response = self._get_kubernetesclient().list_node()
            LOG.debug("Response: %s" % api_response)
            return api_response.items
        except Exception as e:
            LOG.error("Kubernetes exception in kube_get_nodes: %s" % e)
            raise
@@ -532,10 +532,15 @@ class AppOperator(object):
                    }
                }
                body['metadata']['labels'].update(label_dict)
                self._kube.kube_patch_node(hostname, body)
                try:
                    self._kube.kube_patch_node(hostname, body)
                except exception.K8sNodeNotFound:
                    pass

    def _assign_host_labels(self, hosts, labels):
        for host in hosts:
            if host.administrative != constants.ADMIN_LOCKED:
                continue
            for label_str in labels:
                k, v = label_str.split('=')
                try:
@@ -557,6 +562,8 @@ class AppOperator(object):

    def _remove_host_labels(self, hosts, labels):
        for host in hosts:
            if host.administrative != constants.ADMIN_LOCKED:
                continue
            null_labels = {}
            for label_str in labels:
                lbl_obj = self._find_label(host.uuid, label_str)
@@ -941,43 +948,60 @@ class AppOperator(object):

        if self._make_armada_request_with_monitor(app, constants.APP_DELETE_OP):
            if app.system_app:
                try:
                    # TODO convert these kubectl commands to use the k8s api
                    p1 = subprocess.Popen(
                        ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
                         'get', 'pvc', '--no-headers', '-n', 'openstack'],
                        stdout=subprocess.PIPE)
                    p2 = subprocess.Popen(['awk', '{print $3}'],
                                          stdin=p1.stdout,
                                          stdout=subprocess.PIPE)
                    p3 = subprocess.Popen(
                        ['xargs', '-i', 'kubectl',
                         '--kubeconfig=/etc/kubernetes/admin.conf', 'delete',
                         'pv', '{}', '--wait=false'],
                        stdin=p2.stdout,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE)

                # TODO convert these kubectl commands to use the k8s api
                p1 = subprocess.Popen(
                    ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
                     'get', 'pvc', '--no-headers', '-n', 'openstack'],
                    stdout=subprocess.PIPE)
                p2 = subprocess.Popen(['awk', '{print $3}'],
                                      stdin=p1.stdout,
                                      stdout=subprocess.PIPE)
                p3 = subprocess.Popen(
                    ['xargs', '-i', 'kubectl',
                     '--kubeconfig=/etc/kubernetes/admin.conf', 'delete',
                     'pv', '{}', '--wait=false'],
                    stdin=p2.stdout,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE)
                timer = threading.Timer(10, p3.kill)
                try:
                    timer.start()
                    p1.stdout.close()
                    p2.stdout.close()
                    out, err = p3.communicate()
                    if not err:
                    if out and not err:
                        LOG.info("Persistent Volumes marked for deletion.")
                    else:
                        self._abort_operation(app, constants.APP_REMOVE_OP)
                        LOG.error("Failed to clean up PVs after app removal.")
                except Exception as e:
                    self._abort_operation(app, constants.APP_REMOVE_OP)
                    LOG.exception("Failed to clean up PVs after app "
                                  "removal: %s" % e)
                finally:
                    timer.cancel()

                p4 = subprocess.Popen(
                    ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
                     'delete', 'namespace', 'openstack'],
                    stdout=subprocess.PIPE)
                timer2 = threading.Timer(10, p4.kill)
                try:
                    p1 = subprocess.Popen(
                        ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
                         'delete', 'namespace', 'openstack'],
                        stdout=subprocess.PIPE)
                    out, err = p1.communicate()
                    if not err:
                    timer2.start()
                    out, err = p4.communicate()
                    if out and not err:
                        LOG.info("Openstack namespace delete completed.")
                    else:
                        self._abort_operation(app, constants.APP_REMOVE_OP)
                        LOG.error("Failed to clean up openstack namespace"
                                  " after app removal.")
                except Exception as e:
                    self._abort_operation(app, constants.APP_REMOVE_OP)
                    LOG.exception("Failed to clean up openstack namespace "
                                  "after app removal: %s" % e)
                finally:
                    timer2.cancel()

            self._update_app_status(app, constants.APP_UPLOAD_SUCCESS)
            LOG.info("Application (%s) remove completed." % app.name)
@@ -4937,12 +4937,45 @@ class ConductorManager(service.PeriodicService):
        # Audit install states
        self._audit_install_states(hosts)

        # Audit kubernetes node labels
        self._audit_kubernetes_labels(hosts)

        for host in hosts:
            # only audit configured hosts
            if not host.personality:
                continue
            self._audit_ihost_action(host)

    def _audit_kubernetes_labels(self, hosts):
        if not utils.is_kubernetes_config(self.dbapi):
            LOG.debug("_audit_kubernetes_labels skip")
            return

        LOG.debug("Starting kubernetes label audit")
        sysinv_labels = self.dbapi.label_get_all()
        nodes = self._kube.kube_get_nodes()

        for host in hosts:
            try:
                for node in nodes:
                    if host.hostname == node.metadata.name:
                        node_labels = node.metadata.labels
                        host_labels = [l for l in sysinv_labels if l.host_id == host.id]
                        for host_label in host_labels:
                            if host_label.label_key not in node_labels.keys():
                                LOG.info("Label audit: creating %s=%s on node %s"
                                         % (host_label.label_key,
                                            host_label.label_value, host.hostname))
                                body = {
                                    'metadata': {
                                        'labels': {host_label.label_key: host_label.label_value}
                                    }
                                }
                                self._kube.kube_patch_node(host.hostname, body)
            except Exception as e:
                LOG.warning("Failed to sync kubernetes label to host %s: %s" %
                            (host.hostname, e))

    # TODO(CephPoolsDecouple): remove
    @periodic_task.periodic_task(spacing=60)
    def _osd_pool_audit(self, context):
@@ -10530,7 +10563,11 @@ class ConductorManager(service.PeriodicService):
                }
            }
            body['metadata']['labels'].update(label_dict)
            self._kube.kube_patch_node(host.hostname, body)
            try:
                self._kube.kube_patch_node(host.hostname, body)
            except exception.K8sNodeNotFound:
                LOG.info("Host %s does not exist in kubernetes yet, label will "
                         "be added after node's unlock by audit" % host.hostname)

    def update_host_memory(self, context, host_uuid):
        try:
@@ -152,7 +152,8 @@ class NeutronHelm(openstack.OpenstackBaseHelm):
        hosts = self.dbapi.ihost_get_list()

        for host in hosts:
            if (host.invprovision == constants.PROVISIONED):
            if (host.invprovision in [constants.PROVISIONED,
                                      constants.PROVISIONING]):
                if constants.WORKER in utils.get_personalities(host):

                    hostname = str(host.hostname)
@@ -422,7 +422,8 @@ class NovaHelm(openstack.OpenstackBaseHelm):
        hosts = self.dbapi.ihost_get_list()

        for host in hosts:
            if (host.invprovision == constants.PROVISIONED):
            if (host.invprovision in [constants.PROVISIONED,
                                      constants.PROVISIONING]):
                if constants.WORKER in utils.get_personalities(host):

                    hostname = str(host.hostname)