Add cron job for osd audit

After a host swact, the mon and osd deployments on the previously
active controller are deleted, and DRBD switches the /var/lib/ceph/mon
folder between primary and secondary on the two controllers. Once the
swact completes, rook-ceph-operator launches new mon and osd
deployments. If one controller shuts down suddenly during this process,
only the osd deployment on the controller that is still powered on
remains, even after the other controller later powers back on and
recovers. So add a cron job that audits the osd deployment status to
keep the cluster healthy.
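
In short, the audit compares the osd devices each node declares in the
CephCluster spec with the rook-ceph-osd pods actually running on that
node, and restarts rook-ceph-operator when they diverge, unless the
node is tainted and its pods are expected to be down. A minimal sketch
of that decision logic (the function name and the plain-dict inputs are
illustrative only; the real job queries the Kubernetes API, see
osd_audit.py below):

    # Sketch of the audit decision, assuming pre-fetched inputs:
    #   declared: {node_name: [osd devices from the CephCluster spec]}
    #   running:  {node_name: count of Running rook-ceph-osd pods}
    #   tainted:  set of node names carrying node.kubernetes.io/* taints
    def operator_restart_needed(declared, running, tainted):
        for node, devices in declared.items():
            if node in tainted:
                # osd pods are expected to be missing on a tainted (down) node
                continue
            if running.get(node, 0) != len(devices):
                # running pod count differs from the declared devices, so
                # bounce the operator and let it reconcile the osds
                return True
        return False

    if __name__ == '__main__':
        declared = {"controller-0": ["/dev/sdb"], "controller-1": ["/dev/sdb"]}
        running = {"controller-0": 1, "controller-1": 0}
        print(operator_restart_needed(declared, running, set()))             # True
        print(operator_restart_needed(declared, running, {"controller-1"}))  # False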

Partial-Bug: 1920882

Depends-On: I4afde8b1476e14453fac8561f1edde7360b8ee96

Change-Id: I39cb66daecf4052821ceb28344a90ea70f63a742
Signed-off-by: Chen, Haochuan Z <haochuan.z.chen@intel.com>
Chen, Haochuan Z 2021-03-29 08:16:49 +08:00
parent 015c8b5b54
commit 5945b9b82a
6 changed files with 308 additions and 10 deletions


@@ -82,7 +82,7 @@ class RookCephProvisionerHelm(base.BaseHelm):
"host_provision": {
"controller_hosts": self._get_controller_hosts(),
},
"ceph_mon_audit_jobs": self._get_ceph_mon_audit(),
"ceph_audit_jobs": self._get_ceph_audit(),
}
}
@@ -149,7 +149,7 @@ class RookCephProvisionerHelm(base.BaseHelm):
return controller_hosts
def _get_ceph_mon_audit(self):
def _get_ceph_audit(self):
audit = {}
if utils.is_aio_duplex_system(self.dbapi):


@@ -10,11 +10,12 @@ stxdir = {toxinidir}/../../..
distshare={toxworkdir}/.tox/distshare
[testenv]
# usedevelop = True
# enabling usedevelop results in py27 develop-inst:
# Exception: Versioning for this project requires either an sdist tarball,
# or access to an upstream git repository.
# Note. site-packages is true and rpm-python must be yum installed on your dev machine.
usedevelop = True
basepython = python3
sitepackages = True
# tox is silly... these need to be separated by a newline....
@@ -99,7 +100,7 @@ deps = -r{toxinidir}/test-requirements.txt
commands = bandit --ini tox.ini -n 5 -r k8sapp_rook
[testenv:pylint]
basepython = python2.7
basepython = python3
sitepackages = False
deps = {[testenv]deps}
commands =


@@ -0,0 +1,196 @@
{{- define "script.osd_audit" -}}
#!/usr/bin/env python

import os

from cephclient import wrapper
from kubernetes import client
from kubernetes import config
from kubernetes.client import Configuration
from kubernetes.client.rest import ApiException
from six.moves import http_client as httplib

# Kubernetes files
KUBERNETES_ADMIN_CONF = '/etc/kubernetes/admin.conf'

CEPH_MGR_PORT = 7999


def is_k8s_configured():
    """Check to see if the k8s admin config file exists."""
    if os.path.isfile(KUBERNETES_ADMIN_CONF):
        return True
    return False


class KubeOperator(object):

    def __init__(self):
        self._kube_client_batch = None
        self._kube_client_core = None
        self._kube_client_custom_objects = None

    def _load_kube_config(self):
        if not is_k8s_configured():
            # sysinv's exception module is not available here, so raise a
            # plain Exception instead
            raise Exception("Kubernetes admin config file %s not found" %
                            KUBERNETES_ADMIN_CONF)

        config.load_kube_config(KUBERNETES_ADMIN_CONF)

        # Workaround: Turn off SSL/TLS verification
        c = Configuration()
        c.verify_ssl = False
        Configuration.set_default(c)

    def _get_kubernetesclient_core(self):
        if not self._kube_client_core:
            self._load_kube_config()
            self._kube_client_core = client.CoreV1Api()
        return self._kube_client_core

    def _get_kubernetesclient_custom_objects(self):
        if not self._kube_client_custom_objects:
            self._load_kube_config()
            self._kube_client_custom_objects = client.CustomObjectsApi()
        return self._kube_client_custom_objects

    def kube_get_nodes(self):
        try:
            api_response = self._get_kubernetesclient_core().list_node()
            return api_response.items
        except ApiException as e:
            print("Kubernetes exception in kube_get_nodes: %s" % e)
            raise

    def kube_get_pods_by_selector(self, namespace, label_selector,
                                  field_selector):
        c = self._get_kubernetesclient_core()
        try:
            api_response = c.list_namespaced_pod(
                namespace,
                label_selector="%s" % label_selector,
                field_selector="%s" % field_selector)
            return api_response.items
        except ApiException as e:
            print("Kubernetes exception in "
                  "kube_get_pods_by_selector %s/%s/%s: %s" %
                  (namespace, label_selector, field_selector, e))
            return None

    def kube_delete_pod(self, name, namespace, **kwargs):
        body = {}
        if kwargs:
            body.update(kwargs)

        c = self._get_kubernetesclient_core()
        try:
            c.delete_namespaced_pod(name, namespace, body)
            return True
        except ApiException as e:
            if e.status == httplib.NOT_FOUND:
                print("Pod %s/%s not found." % (namespace, name))
                return False
            else:
                print("Failed to delete Pod %s/%s: %s" % (namespace, name, e.body))
                raise

    def get_custom_resource(self, group, version, namespace, plural, name):
        c = self._get_kubernetesclient_custom_objects()
        try:
            api_response = c.list_namespaced_custom_object(group, version,
                                                           namespace, plural)
            return api_response
        except ApiException as ex:
            if ex.reason == "Not Found":
                print("Custom resource %s not found in namespace %s: %s" %
                      (name, namespace, str(ex.body).replace('\n', ' ')))
            return None


def osd_audit():
    kube = KubeOperator()
    group = "ceph.rook.io"
    version = "v1"
    namespace = "kube-system"
    plural = "cephclusters"
    name = "cephclusters.ceph.rook.io.ceph-cluster"

    # If the Ceph REST API already reports HEALTH_OK there is nothing to do.
    try:
        ceph_api = wrapper.CephWrapper(endpoint='http://localhost:{}'.format(CEPH_MGR_PORT))
        response, body = ceph_api.health(body='text', timeout=30)
        if body == "HEALTH_OK":
            print("Cluster reports HEALTH_OK")
            return
        print(body)
    except IOError:
        print("Accessing Ceph API failed. Cluster health unknown. Proceeding.")

    cluster = {}
    try:
        cephcluster = kube.get_custom_resource(group, version, namespace,
                                               plural, name)
        if cephcluster and 'items' in cephcluster:
            cluster = cephcluster['items'][0]
    except ApiException as ex:
        if ex.reason == "Not Found":
            print("CephCluster custom resource not found in namespace %s: %s"
                  % (namespace, str(ex.body).replace('\n', ' ')))

    if cluster and "health" in cluster.get("status", {}).get("ceph", {}):
        health = cluster['status']['ceph']['health']
    else:
        print("Failed to get cluster['status']['ceph']['health']")
        return

    if health != "HEALTH_OK":
        delete_operator = False
        osd_nodes = cluster['spec']['storage']['nodes']

        nodes = {}
        node_list = kube.kube_get_nodes()
        for item in node_list:
            nodes[item.metadata.name] = item.spec.taints

        for n in osd_nodes:
            # osd devices declared for this node in the CephCluster spec
            node_name = n['name']
            osd_devices = n['devices']

            # check whether the osd pods described in the CephCluster spec
            # are actually running on this node
            label = "app=rook-ceph-osd,failure-domain=%s" % node_name
            pods = kube.kube_get_pods_by_selector(namespace, label, "") or []
            osd_pods = []
            for pod in pods:
                if pod.status.phase == 'Running':
                    osd_pods.append(pod)

            if len(osd_devices) != len(osd_pods):
                # assume the operator should be reset when the number of
                # running osd pods differs from this node's osd devices
                delete_operator = True

                # if the osd pods are not running because the node is
                # tainted, there is no need to delete the operator pod
                taints = nodes[node_name]
                if taints:
                    for taint in taints:
                        if taint.key.startswith("node.kubernetes.io"):
                            # pods are not running because of the taint
                            delete_operator = False

                if delete_operator:
                    break

        if delete_operator:
            operator_pod = kube.kube_get_pods_by_selector(
                namespace, "app=rook-ceph-operator", "")
            if operator_pod and operator_pod[0] and operator_pod[0].status.phase == 'Running':
                print("Deleting operator pod %s" % operator_pod[0].metadata.name)
                kube.kube_delete_pod(operator_pod[0].metadata.name, namespace,
                                     grace_periods_seconds=0)


if __name__ == '__main__':
    osd_audit()
{{- end -}}


@@ -29,9 +29,10 @@ data:
active=$node
fi
controller_node=$(kubectl get pods -n kube-system --selector=app="rook-ceph-mon,ceph_daemon_id=a" -o wide | awk '/controller/ {print $7}')
controller_node=$(kubectl get pods -n kube-system --selector=app="rook-ceph-mon,ceph_daemon_id=a" -o wide | awk '/Running.*controller/ {print $7}')
if [ x"$active" = x"$controller_node" ]; then
echo "mon-a pod is running on active controler"
exit 0
fi
@@ -62,9 +63,10 @@ kind: CronJob
metadata:
name: stx-ceph-mon-audit
spec:
schedule: {{ .Values.ceph_mon_audit_jobs.audit.cron | quote }}
successfulJobsHistoryLimit: {{ .Values.ceph_mon_audit_jobs.audit.history.success }}
failedJobsHistoryLimit: {{ .Values.ceph_mon_audit_jobs.audit.history.failed }}
schedule: {{ .Values.ceph_audit_jobs.audit.cron | quote }}
startingDeadlineSeconds: {{ .Values.ceph_audit_jobs.audit.deadline }}
successfulJobsHistoryLimit: {{ .Values.ceph_audit_jobs.audit.history.success }}
failedJobsHistoryLimit: {{ .Values.ceph_audit_jobs.audit.history.failed }}
concurrencyPolicy: Forbid
jobTemplate:
metadata:
@@ -104,7 +106,7 @@ spec:
- name: NAMESPACE
value: {{ .Release.Namespace }}
- name: FLOAT_IP
value: {{ .Values.ceph_mon_audit_jobs.floatIP }}
value: {{ .Values.ceph_audit_jobs.floatIP }}
volumeMounts:
- name: platform
mountPath: /opt/platform


@@ -0,0 +1,97 @@
{{/*
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
*/}}

{{- if .Values.global.job_ceph_osd_audit }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: ceph-osd-audit-bin
  namespace: {{ .Release.Namespace }}
data:
  osd_audit.py: |-
{{- include "script.osd_audit" . | indent 4 }}
---
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: stx-ceph-osd-audit
spec:
  schedule: {{ .Values.ceph_audit_jobs.audit.cron | quote }}
  startingDeadlineSeconds: {{ .Values.ceph_audit_jobs.audit.deadline }}
  successfulJobsHistoryLimit: {{ .Values.ceph_audit_jobs.audit.history.success }}
  failedJobsHistoryLimit: {{ .Values.ceph_audit_jobs.audit.history.failed }}
  concurrencyPolicy: Forbid
  jobTemplate:
    metadata:
      name: stx-ceph-osd-audit
      namespace: {{ .Release.Namespace }}
      labels:
        app: ceph-osd-audit
    spec:
      template:
        metadata:
          labels:
            app: ceph-osd-audit
        spec:
          serviceAccountName: {{ .Values.rbac.serviceAccount }}
          restartPolicy: OnFailure
          hostNetwork: true
          {{- if .Values.global.nodeSelector }}
          nodeSelector:
{{ .Values.global.nodeSelector | toYaml | trim | indent 10 }}
          {{- end }}
          volumes:
            - name: ceph-osd-audit-bin
              configMap:
                name: ceph-osd-audit-bin
                defaultMode: 0555
            - name: kube-config
              hostPath:
                path: /etc/kubernetes/admin.conf
            - name: config-key-provision
              configMap:
                name: {{ .Values.global.configmap_key_init }}
            - name: ceph-config
              emptyDir: {}
          initContainers:
            - name: init
              image: {{ .Values.images.tags.ceph_config_helper | quote }}
              command: [ "/bin/bash", "/tmp/mount/provision.sh" ]
              env:
                - name: ADMIN_KEYRING
                  valueFrom:
                    secretKeyRef:
                      name: rook-ceph-admin-keyring
                      key: keyring
                - name: ROOK_MONS
                  valueFrom:
                    configMapKeyRef:
                      name: rook-ceph-mon-endpoints
                      key: data
              volumeMounts:
                - name: ceph-config
                  mountPath: /etc/ceph
                - name: config-key-provision
                  mountPath: /tmp/mount
          containers:
            - name: ceph-osd-audit
              image: {{ .Values.images.tags.stx_ceph_manager | quote }}
              command: [ "python", "/tmp/mount/osd_audit.py" ]
              env:
                - name: NAMESPACE
                  value: {{ .Release.Namespace }}
              volumeMounts:
                - name: ceph-osd-audit-bin
                  mountPath: /tmp/mount
                - name: ceph-config
                  mountPath: /etc/ceph
                  readOnly: true
                - name: kube-config
                  mountPath: /etc/kubernetes/admin.conf
                  readOnly: true
{{- end }}


@@ -14,6 +14,7 @@ global:
provision_storage: true
job_ceph_mgr_provision: true
job_ceph_mon_audit: false
job_ceph_osd_audit: true
job_host_provision: true
job_cleanup: true
deployment_stx_ceph_manager: true
@@ -79,10 +80,11 @@ host_provision:
- controller-0
ceph_mon_audit_jobs:
ceph_audit_jobs:
floatIP: 192.168.204.2
audit:
cron: "*/3 * * * *"
deadline: 200
history:
success: 1
failed: 1