Trigger application reapply on host unlock/delete

- Triggers an application reapply (including override
  regeneration) on node unlock to pick up a newly added node
  or configuration changes.
- The reapply is also triggered on node delete if the host
  had a compute node label, so that any per-host overrides
  are removed.
- Turned on the restriction that nodes must be locked to modify
  labels. Added an audit task to sync any labels that were
  assigned before a node's first unlock and are missing from k8s.
- Modified the k8s puppet manifest to launch kubelet directly
  only on the initial configuration; afterwards, a systemd
  override gives the service a dependency on the config gate.
  This avoids PLEG errors in kubernetes caused by the node
  being overwhelmed during boot.
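
A sketch of what the puppet file/exec resources below amount to, written
as illustrative Python only (the drop-in path and content are taken from
the diffs; the helper name is hypothetical and this is not part of the
change):

    import os
    import subprocess

    KUBELET_DROPIN_DIR = '/etc/systemd/system/kubelet.service.d'

    def gate_kubelet_on_config():
        # Order kubelet after the config gate so a rebooting node finishes
        # its configuration before kubelet starts (avoids the PLEG errors
        # noted above).
        if not os.path.isdir(KUBELET_DROPIN_DIR):
            os.makedirs(KUBELET_DROPIN_DIR)
        path = os.path.join(KUBELET_DROPIN_DIR, 'kube-stx-override.conf')
        with open(path, 'w') as f:
            f.write('[Unit]\nAfter=config.service\n')
        # systemd must re-read unit files before the override takes effect.
        subprocess.check_call(['systemctl', 'daemon-reload'])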

Change-Id: I1d9ca92f451aa322765da43ffcbb1d95f97f92f2
Story: 2004520
Task: 28826
Signed-off-by: Tyler Smith <tyler.smith@windriver.com>
Author: Tyler Smith 2019-01-10 11:37:27 -05:00
parent 3f6e511ec3
commit 5624c74062
11 changed files with 197 additions and 36 deletions

@@ -39,7 +39,6 @@ class platform::kubernetes::kubeadm {
}
# Start kubelet.
-> service { 'kubelet':
ensure => 'running',
enable => true,
}
# A separate enable is required since we have modified the service resource
@@ -124,6 +123,21 @@ class platform::kubernetes::master::init
command => "kubectl --kubeconfig=/etc/kubernetes/admin.conf taint node ${::platform::params::hostname} node-role.kubernetes.io/master-", # lint:ignore:140chars
logoutput => true,
}
# Add a dependency to kubelet on config so it doesn't enter a bad state on subsequent boots
-> file { '/etc/systemd/system/kubelet.service.d/kube-stx-override.conf':
ensure => file,
content => template('platform/kube-stx-override.conf.erb'),
owner => 'root',
group => 'root',
mode => '0644',
}
# Reload systemd
-> exec { 'perform systemctl daemon reload for kubelet override':
command => 'systemctl daemon-reload',
logoutput => true,
}
} else {
if str2bool($::is_initial_config) {
# For subsequent controller installs, install kubernetes using the
@@ -206,6 +220,21 @@ class platform::kubernetes::master::init
command => "kubectl --kubeconfig=/etc/kubernetes/admin.conf taint node ${::platform::params::hostname} node-role.kubernetes.io/master-", # lint:ignore:140chars
logoutput => true,
}
# Add a dependency to kubelet on config so it doesn't enter a bad state on subsequent boots
-> file { '/etc/systemd/system/kubelet.service.d/kube-stx-override.conf':
ensure => file,
content => template('platform/kube-stx-override.conf.erb'),
owner => 'root',
group => 'root',
mode => '0644',
}
# Reload systemd
-> exec { 'perform systemctl daemon reload for kubelet override':
command => 'systemctl daemon-reload',
logoutput => true,
}
}
}
}
@@ -242,6 +271,21 @@ class platform::kubernetes::worker::init
logoutput => true,
unless => 'test -f /etc/kubernetes/kubelet.conf',
}
# Add a dependency to kubelet on config so it doesn't enter a bad state
-> file { '/etc/systemd/system/kubelet.service.d/kube-stx-override.conf':
ensure => file,
content => template('platform/kube-stx-override.conf.erb'),
owner => 'root',
group => 'root',
mode => '0644',
}
# Reload systemd
-> exec { 'perform systemctl daemon reload for kubelet override':
command => 'systemctl daemon-reload',
logoutput => true,
}
}
class platform::kubernetes::worker

@@ -0,0 +1,2 @@
[Unit]
After=config.service

@@ -2110,6 +2110,11 @@ class HostController(rest.RestController):
ihost_obj['uuid'],
ibm_msg_dict)
# Trigger a system app reapply if the host has been unlocked
if (utils.is_kubernetes_config() and patched_ihost.get('action') in
[constants.UNLOCK_ACTION, constants.FORCE_UNLOCK_ACTION]):
self._reapply_system_app()
elif mtc_response['status'] is None:
raise wsme.exc.ClientSideError(
_("Timeout waiting for maintenance response. "
@@ -2341,6 +2346,15 @@ class HostController(rest.RestController):
# wait for VIM signal
return
openstack_worker = False
if utils.is_kubernetes_config():
labels = objects.label.get_by_host_id(pecan.request.context, ihost.uuid)
for l in labels:
if (constants.COMPUTE_NODE_LABEL ==
str(l.label_key) + '=' + str(l.label_value)):
openstack_worker = True
break
idict = {'operation': constants.DELETE_ACTION,
'uuid': ihost.uuid,
'invprovision': ihost.invprovision}
@@ -2464,6 +2478,32 @@ class HostController(rest.RestController):
pecan.request.dbapi.ihost_destroy(ihost_id)
# If the host being removed was an openstack worker node, trigger
# a reapply
if openstack_worker:
self._reapply_system_app()
def _reapply_system_app(self):
try:
db_app = objects.kube_app.get_by_name(
pecan.request.context, constants.HELM_APP_OPENSTACK)
if db_app.status == constants.APP_APPLY_SUCCESS:
LOG.info(
"Reapplying the %s app" % constants.HELM_APP_OPENSTACK)
db_app.status = constants.APP_APPLY_IN_PROGRESS
db_app.progress = None
db_app.save()
pecan.request.rpcapi.perform_app_apply(
pecan.request.context, db_app)
else:
LOG.info("%s system app is present but not applied, "
"skipping re-apply" % constants.HELM_APP_OPENSTACK)
except exception.KubeAppNotFound:
LOG.info(
"%s system app not present, skipping re-apply" %
constants.HELM_APP_OPENSTACK)
def _check_upgrade_provision_order(self, personality, hostname):
LOG.info("_check_upgrade_provision_order personality=%s, hostname=%s" %
(personality, hostname))
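
The delete path above flattens each stored label to a "key=value" string
before comparing it with COMPUTE_NODE_LABEL. A minimal, self-contained
sketch of that match (the constant's value here is an assumption for
illustration, not taken from this diff):

    # Hypothetical value; the real constant lives in sysinv's constants module.
    COMPUTE_NODE_LABEL = 'openstack-compute-node=enabled'

    def has_compute_node_label(labels):
        """labels: iterable of (key, value) pairs as stored by sysinv."""
        return any('%s=%s' % (k, v) == COMPUTE_NODE_LABEL for k, v in labels)

    assert has_compute_node_label([('openstack-compute-node', 'enabled')])
    assert not has_compute_node_label([('sriov', 'enabled')])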

@@ -288,11 +288,5 @@ class LabelController(rest.RestController):
# UTILS
###########
def _check_host_locked(host):
- # TODO(ksmith):
- # turn this on later
- return
- if (utils.is_aio_simplex_host_unlocked(host) or
- host.administrative != constants.ADMIN_LOCKED):
+ if host.administrative != constants.ADMIN_LOCKED:
raise wsme.exc.ClientSideError(_("Host must be locked."))

@@ -123,6 +123,7 @@ HOST_DELETE = 'host_delete' # for personality sub-type validation
# Availability
AVAILABILITY_AVAILABLE = 'available'
AVAILABILITY_INTEST = 'intest'
AVAILABILITY_OFFLINE = 'offline'
AVAILABILITY_ONLINE = 'online'
AVAILABILITY_DEGRADED = 'degraded'

@@ -1170,6 +1170,10 @@ class HostLabelInvalid(Invalid):
message = _("Host label is invalid. Reason: %(reason)s")
class K8sNodeNotFound(NotFound):
message = _("Kubernetes Node %(name)s could not be found.")
class PickleableException(Exception):
"""
Pickleable Exception

@@ -51,6 +51,19 @@ class KubeOperator(object):
if e.status == httplib.UNPROCESSABLE_ENTITY:
reason = json.loads(e.body).get('message', "")
raise exception.HostLabelInvalid(reason=reason)
elif e.status == httplib.NOT_FOUND:
raise exception.K8sNodeNotFound(name=name)
else:
raise
except Exception as e:
LOG.error("Kubernetes exception: %s" % e)
LOG.error("Kubernetes exception in kube_patch_node: %s" % e)
raise
def kube_get_nodes(self):
try:
api_response = self._get_kubernetesclient().list_node()
LOG.debug("Response: %s" % api_response)
return api_response.items
except Exception as e:
LOG.error("Kubernetes exception in kube_get_nodes: %s" % e)
raise
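
Callers that can tolerate a node not yet registered with kubernetes are
expected to catch the new K8sNodeNotFound, as the AppOperator and
conductor changes below do. A usage sketch (the hostname and label are
placeholders):

    body = {'metadata': {'labels': {'openstack-compute-node': 'enabled'}}}
    try:
        kube_operator.kube_patch_node('compute-0', body)
    except exception.K8sNodeNotFound:
        # Node not in k8s yet; the conductor's label audit will sync the
        # label after the node is unlocked and registers itself.
        pass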

@@ -532,10 +532,15 @@ class AppOperator(object):
}
}
body['metadata']['labels'].update(label_dict)
- self._kube.kube_patch_node(hostname, body)
+ try:
+ self._kube.kube_patch_node(hostname, body)
+ except exception.K8sNodeNotFound:
+ pass
def _assign_host_labels(self, hosts, labels):
for host in hosts:
if host.administrative != constants.ADMIN_LOCKED:
continue
for label_str in labels:
k, v = label_str.split('=')
try:
@@ -557,6 +562,8 @@
def _remove_host_labels(self, hosts, labels):
for host in hosts:
if host.administrative != constants.ADMIN_LOCKED:
continue
null_labels = {}
for label_str in labels:
lbl_obj = self._find_label(host.uuid, label_str)
@@ -941,43 +948,60 @@
if self._make_armada_request_with_monitor(app, constants.APP_DELETE_OP):
if app.system_app:
- try:
- # TODO convert these kubectl commands to use the k8s api
- p1 = subprocess.Popen(
- ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
- 'get', 'pvc', '--no-headers', '-n', 'openstack'],
- stdout=subprocess.PIPE)
- p2 = subprocess.Popen(['awk', '{print $3}'],
- stdin=p1.stdout,
- stdout=subprocess.PIPE)
- p3 = subprocess.Popen(
- ['xargs', '-i', 'kubectl',
- '--kubeconfig=/etc/kubernetes/admin.conf', 'delete',
- 'pv', '{}', '--wait=false'],
- stdin=p2.stdout,
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE)
+ # TODO convert these kubectl commands to use the k8s api
+ p1 = subprocess.Popen(
+ ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
+ 'get', 'pvc', '--no-headers', '-n', 'openstack'],
+ stdout=subprocess.PIPE)
+ p2 = subprocess.Popen(['awk', '{print $3}'],
+ stdin=p1.stdout,
+ stdout=subprocess.PIPE)
+ p3 = subprocess.Popen(
+ ['xargs', '-i', 'kubectl',
+ '--kubeconfig=/etc/kubernetes/admin.conf', 'delete',
+ 'pv', '{}', '--wait=false'],
+ stdin=p2.stdout,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ timer = threading.Timer(10, p3.kill)
+ try:
+ timer.start()
+ p1.stdout.close()
+ p2.stdout.close()
out, err = p3.communicate()
- if not err:
+ if out and not err:
LOG.info("Persistent Volumes marked for deletion.")
else:
self._abort_operation(app, constants.APP_REMOVE_OP)
LOG.error("Failed to clean up PVs after app removal.")
except Exception as e:
self._abort_operation(app, constants.APP_REMOVE_OP)
LOG.exception("Failed to clean up PVs after app "
"removal: %s" % e)
finally:
timer.cancel()
+ p4 = subprocess.Popen(
+ ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
+ 'delete', 'namespace', 'openstack'],
+ stdout=subprocess.PIPE)
+ timer2 = threading.Timer(10, p4.kill)
try:
- p1 = subprocess.Popen(
- ['kubectl', '--kubeconfig=/etc/kubernetes/admin.conf',
- 'delete', 'namespace', 'openstack'],
- stdout=subprocess.PIPE)
- out, err = p1.communicate()
- if not err:
+ timer2.start()
+ out, err = p4.communicate()
+ if out and not err:
LOG.info("Openstack namespace delete completed.")
else:
self._abort_operation(app, constants.APP_REMOVE_OP)
LOG.error("Failed to clean up openstack namespace"
" after app removal.")
except Exception as e:
self._abort_operation(app, constants.APP_REMOVE_OP)
LOG.exception("Failed to clean up openstack namespace "
"after app removal: %s" % e)
finally:
timer2.cancel()
self._update_app_status(app, constants.APP_UPLOAD_SUCCESS)
LOG.info("Application (%s) remove completed." % app.name)

@@ -4937,12 +4937,45 @@
# Audit install states
self._audit_install_states(hosts)
# Audit kubernetes node labels
self._audit_kubernetes_labels(hosts)
for host in hosts:
# only audit configured hosts
if not host.personality:
continue
self._audit_ihost_action(host)
def _audit_kubernetes_labels(self, hosts):
if not utils.is_kubernetes_config(self.dbapi):
LOG.debug("_audit_kubernetes_labels skip")
return
LOG.debug("Starting kubernetes label audit")
sysinv_labels = self.dbapi.label_get_all()
nodes = self._kube.kube_get_nodes()
for host in hosts:
try:
for node in nodes:
if host.hostname == node.metadata.name:
node_labels = node.metadata.labels
host_labels = [l for l in sysinv_labels if l.host_id == host.id]
for host_label in host_labels:
if host_label.label_key not in node_labels.keys():
LOG.info("Label audit: creating %s=%s on node %s"
% (host_label.label_key,
host_label.label_value, host.hostname))
body = {
'metadata': {
'labels': {host_label.label_key: host_label.label_value}
}
}
self._kube.kube_patch_node(host.hostname, body)
except Exception as e:
LOG.warning("Failed to sync kubernetes label to host %s: %s" %
(host.hostname, e))
# TODO(CephPoolsDecouple): remove
@periodic_task.periodic_task(spacing=60)
def _osd_pool_audit(self, context):
@@ -10530,7 +10563,11 @@
}
}
body['metadata']['labels'].update(label_dict)
- self._kube.kube_patch_node(host.hostname, body)
+ try:
+ self._kube.kube_patch_node(host.hostname, body)
+ except exception.K8sNodeNotFound:
+ LOG.info("Host %s does not exist in kubernetes yet, label will "
+ "be added after node's unlock by audit" % host.hostname)
def update_host_memory(self, context, host_uuid):
try:

@@ -152,7 +152,8 @@ class NeutronHelm(openstack.OpenstackBaseHelm):
hosts = self.dbapi.ihost_get_list()
for host in hosts:
- if (host.invprovision == constants.PROVISIONED):
+ if (host.invprovision in [constants.PROVISIONED,
+ constants.PROVISIONING]):
if constants.WORKER in utils.get_personalities(host):
hostname = str(host.hostname)

@@ -422,7 +422,8 @@ class NovaHelm(openstack.OpenstackBaseHelm):
hosts = self.dbapi.ihost_get_list()
for host in hosts:
- if (host.invprovision == constants.PROVISIONED):
+ if (host.invprovision in [constants.PROVISIONED,
+ constants.PROVISIONING]):
if constants.WORKER in utils.get_personalities(host):
hostname = str(host.hostname)
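
Both helm plugins get the same relaxation: the reapply now fires at
unlock time, when a new worker can still be PROVISIONING rather than
PROVISIONED, so per-host overrides must be generated for both states. A
self-contained illustration (the constant values are assumptions):

    PROVISIONED = 'provisioned'    # assumed values, for illustration only
    PROVISIONING = 'provisioning'
    WORKER = 'worker'

    def wants_per_host_overrides(invprovision, personalities):
        # Mirrors the checks in the two helm plugins above.
        return (invprovision in (PROVISIONED, PROVISIONING)
                and WORKER in personalities)

    assert wants_per_host_overrides(PROVISIONING, ['worker'])
    assert not wants_per_host_overrides(None, ['worker'])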