Browse Source

[fedora-atomic-k8s] Adding Node Problem Detector

Deploying the Node Problem Detector to all nodes to detect problems which
can be leveraged by auto healing. This is the first step of enabling
the auto-healing feature.

Task: 29886
Story: 2004782

Change-Id: I1b6075025c5f369821b4136783e68b16535dc6ef
changes/02/641902/4
Feilong Wang 2 years ago
parent
commit
c39f1150e5
  1. 117
      magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh
  2. 1
      magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml
  3. 3
      magnum/drivers/heat/k8s_fedora_template_def.py
  4. 7
      magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml
  5. 5
      magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml
  6. 4
      magnum/tests/unit/drivers/test_template_definition.py

117
magnum/drivers/common/templates/kubernetes/fragments/enable-auto-healing.sh

@ -0,0 +1,117 @@
#!/bin/sh
# Deploy the Kubernetes Node Problem Detector (NPD) as a DaemonSet on every
# node. NPD reports node-level problems (kernel deadlocks, bad docker state,
# ...) which can later be leveraged by the auto-healing feature.
step="enable-auto-healing"
printf "Starting to run ${step}\n"

# Heat injects cluster configuration (CONTAINER_INFRA_PREFIX,
# NODE_PROBLEM_DETECTOR_TAG, ...) through this file.
. /etc/sysconfig/heat-params

# Allow an alternate registry mirror; default to the upstream GCR image.
_gcr_prefix=${CONTAINER_INFRA_PREFIX:-k8s.gcr.io/}

# Generate the Node Problem Detector manifest file (only once — skip if it
# already exists so re-runs of this fragment are idempotent).
NPD_DEPLOY=/srv/magnum/kubernetes/manifests/npd.yaml

[ -f ${NPD_DEPLOY} ] || {
    echo "Writing File: $NPD_DEPLOY"
    mkdir -p $(dirname ${NPD_DEPLOY})
    # Unquoted EOF: shell variables (${NODE_PROBLEM_DETECTOR_TAG},
    # ${_gcr_prefix}) are expanded into the manifest below.
    cat << EOF > ${NPD_DEPLOY}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: node-problem-detector
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: npd-binding
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:node-problem-detector
subjects:
- kind: ServiceAccount
  name: node-problem-detector
  namespace: kube-system
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: npd
  namespace: kube-system
  labels:
    k8s-app: node-problem-detector
    version: ${NODE_PROBLEM_DETECTOR_TAG}
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: Reconcile
spec:
  selector:
    matchLabels:
      k8s-app: node-problem-detector
      version: ${NODE_PROBLEM_DETECTOR_TAG}
  template:
    metadata:
      labels:
        k8s-app: node-problem-detector
        version: ${NODE_PROBLEM_DETECTOR_TAG}
        kubernetes.io/cluster-service: "true"
    spec:
      containers:
      - name: node-problem-detector
        image: ${_gcr_prefix}node-problem-detector:${NODE_PROBLEM_DETECTOR_TAG}
        command:
        - "/bin/sh"
        - "-c"
        # Pass both config to support both journald and syslog.
        - "exec /node-problem-detector --logtostderr --system-log-monitors=/config/kernel-monitor.json,/config/kernel-monitor-filelog.json,/config/docker-monitor.json,/config/docker-monitor-filelog.json >>/var/log/node-problem-detector.log 2>&1"
        securityContext:
          privileged: true
        resources:
          limits:
            cpu: "200m"
            memory: "100Mi"
          requests:
            cpu: "20m"
            memory: "20Mi"
        env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        volumeMounts:
        - name: log
          mountPath: /var/log
        - name: localtime
          mountPath: /etc/localtime
          readOnly: true
      volumes:
      - name: log
        hostPath:
          path: /var/log/
      - name: localtime
        hostPath:
          path: /etc/localtime
          type: "FileOrCreate"
      serviceAccountName: node-problem-detector
      tolerations:
      # Tolerate everything (incl. NoExecute) so NPD keeps running on nodes
      # that are already reporting problems — that is exactly when we need it.
      - operator: "Exists"
        effect: "NoExecute"
      - key: "CriticalAddonsOnly"
        operator: "Exists"
EOF
}

# Block until the local apiserver answers health checks, then apply.
echo "Waiting for Kubernetes API..."
until [ "ok" = "$(curl --silent http://127.0.0.1:8080/healthz)" ]
do
    sleep 5
done

kubectl apply -f ${NPD_DEPLOY}

printf "Finished running ${step}\n"

1
magnum/drivers/common/templates/kubernetes/fragments/write-heat-params-master.yaml

@ -86,3 +86,4 @@ write_files:
TILLER_ENABLED="$TILLER_ENABLED"
TILLER_TAG="$TILLER_TAG"
TILLER_NAMESPACE="$TILLER_NAMESPACE"
NODE_PROBLEM_DETECTOR_TAG="$NODE_PROBLEM_DETECTOR_TAG"

3
magnum/drivers/heat/k8s_fedora_template_def.py

@ -118,7 +118,8 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
'keystone_auth_enabled', 'k8s_keystone_auth_tag',
'tiller_enabled',
'tiller_tag',
'tiller_namespace']
'tiller_namespace',
'node_problem_detector_tag']
for label in label_list:
label_value = cluster.labels.get(label)

7
magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml

@ -555,6 +555,11 @@ parameters:
description: namespace where tiller will be installed.
default: "magnum-tiller"
node_problem_detector_tag:
type: string
description: tag of the node problem detector container
default: v0.6.2
resources:
######################################################################
@ -818,6 +823,7 @@ resources:
tiller_enabled: {get_param: tiller_enabled}
tiller_tag: {get_param: tiller_tag}
tiller_namespace: {get_param: tiller_namespace}
node_problem_detector_tag: {get_param: node_problem_detector_tag}
kube_cluster_config:
type: OS::Heat::SoftwareConfig
@ -848,6 +854,7 @@ resources:
template: {get_file: ../../common/templates/kubernetes/fragments/enable-ingress-controller.sh}
- get_file: ../../common/templates/kubernetes/fragments/kube-dashboard-service.sh
- get_file: ../../common/templates/kubernetes/fragments/enable-keystone-auth.sh
- get_file: ../../common/templates/kubernetes/fragments/enable-auto-healing.sh
kube_cluster_deploy:
type: OS::Heat::SoftwareDeployment

5
magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml

@ -430,6 +430,10 @@ parameters:
type: string
description: namespace where tiller will be installed
node_problem_detector_tag:
type: string
description: tag of the node problem detector container
resources:
######################################################################
#
@ -539,6 +543,7 @@ resources:
"$TILLER_ENABLED": {get_param: tiller_enabled}
"$TILLER_TAG": {get_param: tiller_tag}
"$TILLER_NAMESPACE": {get_param: tiller_namespace}
"$NODE_PROBLEM_DETECTOR_TAG": {get_param: node_problem_detector_tag}
install_openstack_ca:
type: OS::Heat::SoftwareConfig

4
magnum/tests/unit/drivers/test_template_definition.py

@ -424,6 +424,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'tiller_tag')
tiller_namespace = mock_cluster.labels.get(
'tiller_namespace')
npd_tag = mock_cluster.labels.get('node_problem_detector_tag')
k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition()
@ -484,6 +485,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'tiller_enabled': tiller_enabled,
'tiller_tag': tiller_tag,
'tiller_namespace': tiller_namespace,
'node_problem_detector_tag': npd_tag,
}}
mock_get_params.assert_called_once_with(mock_context,
mock_cluster_template,
@ -790,6 +792,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'tiller_tag')
tiller_namespace = mock_cluster.labels.get(
'tiller_namespace')
npd_tag = mock_cluster.labels.get('node_problem_detector_tag')
k8s_def = k8sa_tdef.AtomicK8sTemplateDefinition()
@ -852,6 +855,7 @@ class AtomicK8sTemplateDefinitionTestCase(BaseK8sTemplateDefinitionTestCase):
'tiller_enabled': tiller_enabled,
'tiller_tag': tiller_tag,
'tiller_namespace': tiller_namespace,
'node_problem_detector_tag': npd_tag,
}}
mock_get_params.assert_called_once_with(mock_context,
mock_cluster_template,

Loading…
Cancel
Save