Add cron job for osd audit

After a host swact, the mon and osd deployments on the previously
active controller are deleted, and DRBD switches the /var/lib/ceph/mon
folder between primary and secondary on the two controllers. Once the
swact completes, rook-ceph-operator launches new mon and osd
deployments. If one controller shuts down suddenly during this process,
only the osd deployment on the controller that is still powered on
remains, even after the other controller later powers back on and
recovers. So add a cron job that audits the osd deployment status to
keep the cluster healthy.
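
In short, the audit compares the osd devices each node declares in the
CephCluster spec with the rook-ceph-osd pods actually running on that
node, and restarts rook-ceph-operator when they diverge, unless the
node is tainted and its pods are expected to be down. A minimal sketch
of that decision logic (the function name and the plain-dict inputs are
illustrative only; the real job queries the Kubernetes API, see
osd_audit.py below):

    # Sketch of the audit decision, assuming pre-fetched inputs:
    #   declared: {node_name: [osd devices from the CephCluster spec]}
    #   running:  {node_name: count of Running rook-ceph-osd pods}
    #   tainted:  set of node names carrying node.kubernetes.io/* taints
    def operator_restart_needed(declared, running, tainted):
        for node, devices in declared.items():
            if node in tainted:
                # osd pods are expected to be missing on a tainted (down) node
                continue
            if running.get(node, 0) != len(devices):
                # running pod count differs from the declared devices, so
                # bounce the operator and let it reconcile the osds
                return True
        return False

    if __name__ == '__main__':
        declared = {"controller-0": ["/dev/sdb"], "controller-1": ["/dev/sdb"]}
        running = {"controller-0": 1, "controller-1": 0}
        print(operator_restart_needed(declared, running, set()))             # True
        print(operator_restart_needed(declared, running, {"controller-1"}))  # False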

Partial-Bug: 1920882

Depends-On: I4afde8b1476e14453fac8561f1edde7360b8ee96

Change-Id: I39cb66daecf4052821ceb28344a90ea70f63a742
Signed-off-by: Chen, Haochuan Z <haochuan.z.chen@intel.com>
Chen, Haochuan Z 2021-03-29 08:16:49 +08:00
parent 015c8b5b54
commit 5945b9b82a
6 changed files with 308 additions and 10 deletions


@@ -82,7 +82,7 @@ class RookCephProvisionerHelm(base.BaseHelm):
"host_provision": {
"controller_hosts": self._get_controller_hosts(),
},
"ceph_mon_audit_jobs": self._get_ceph_mon_audit(),
"ceph_audit_jobs": self._get_ceph_audit(),
}
}
@@ -149,7 +149,7 @@ class RookCephProvisionerHelm(base.BaseHelm):
return controller_hosts
def _get_ceph_mon_audit(self):
def _get_ceph_audit(self):
audit = {}
if utils.is_aio_duplex_system(self.dbapi):


@@ -10,11 +10,12 @@ stxdir = {toxinidir}/../../..
distshare={toxworkdir}/.tox/distshare
[testenv]
# usedevelop = True
# enabling usedevelop results in py27 develop-inst:
# Exception: Versioning for this project requires either an sdist tarball,
# or access to an upstream git repository.
# Note. site-packages is true and rpm-python must be yum installed on your dev machine.
usedevelop = True
basepython = python3
sitepackages = True
# tox is silly... these need to be separated by a newline....
@@ -99,7 +100,7 @@ deps = -r{toxinidir}/test-requirements.txt
commands = bandit --ini tox.ini -n 5 -r k8sapp_rook
[testenv:pylint]
basepython = python2.7
basepython = python3
sitepackages = False
deps = {[testenv]deps}
commands =


@@ -0,0 +1,196 @@
{{- define "script.osd_audit" -}}
#!/usr/bin/env python

import os

from cephclient import wrapper
from kubernetes import client
from kubernetes import config
from kubernetes.client import Configuration
from kubernetes.client.rest import ApiException
from six.moves import http_client as httplib

# Kubernetes files
KUBERNETES_ADMIN_CONF = '/etc/kubernetes/admin.conf'

CEPH_MGR_PORT = 7999


def is_k8s_configured():
    """Check to see if the k8s admin config file exists."""
    if os.path.isfile(KUBERNETES_ADMIN_CONF):
        return True
    return False


class KubeOperator(object):

    def __init__(self):
        self._kube_client_batch = None
        self._kube_client_core = None
        self._kube_client_custom_objects = None

    def _load_kube_config(self):
        if not is_k8s_configured():
            # sysinv's exception module is not available here, so raise a
            # plain Exception instead
            raise Exception("Kubernetes admin config file %s not found" %
                            KUBERNETES_ADMIN_CONF)

        config.load_kube_config(KUBERNETES_ADMIN_CONF)

        # Workaround: Turn off SSL/TLS verification
        c = Configuration()
        c.verify_ssl = False
        Configuration.set_default(c)

    def _get_kubernetesclient_core(self):
        if not self._kube_client_core:
            self._load_kube_config()
            self._kube_client_core = client.CoreV1Api()
        return self._kube_client_core

    def _get_kubernetesclient_custom_objects(self):
        if not self._kube_client_custom_objects:
            self._load_kube_config()
            self._kube_client_custom_objects = client.CustomObjectsApi()
        return self._kube_client_custom_objects

    def kube_get_nodes(self):
        try:
            api_response = self._get_kubernetesclient_core().list_node()
            return api_response.items
        except ApiException as e:
            print("Kubernetes exception in kube_get_nodes: %s" % e)
            raise

    def kube_get_pods_by_selector(self, namespace, label_selector,
                                  field_selector):
        c = self._get_kubernetesclient_core()
        try:
            api_response = c.list_namespaced_pod(
                namespace,
                label_selector="%s" % label_selector,
                field_selector="%s" % field_selector)
            return api_response.items
        except ApiException as e:
            print("Kubernetes exception in "
                  "kube_get_pods_by_selector %s/%s/%s: %s" %
                  (namespace, label_selector, field_selector, e))
            return None

    def kube_delete_pod(self, name, namespace, **kwargs):
        body = {}
        if kwargs:
            body.update(kwargs)

        c = self._get_kubernetesclient_core()
        try:
            c.delete_namespaced_pod(name, namespace, body)
            return True
        except ApiException as e:
            if e.status == httplib.NOT_FOUND:
                print("Pod %s/%s not found." % (namespace, name))
                return False
            else:
                print("Failed to delete Pod %s/%s: %s" % (namespace, name, e.body))
                raise

    def get_custom_resource(self, group, version, namespace, plural, name):
        c = self._get_kubernetesclient_custom_objects()
        try:
            api_response = c.list_namespaced_custom_object(group, version,
                                                           namespace, plural)
            return api_response
        except ApiException as ex:
            if ex.reason == "Not Found":
                print("Custom resource %s not found in namespace %s: %s" %
                      (name, namespace, str(ex.body).replace('\n', ' ')))
            return None


def osd_audit():
    kube = KubeOperator()
    group = "ceph.rook.io"
    version = "v1"
    namespace = "kube-system"
    plural = "cephclusters"
    name = "cephclusters.ceph.rook.io.ceph-cluster"

    # If the Ceph REST API already reports HEALTH_OK there is nothing to do.
    try:
        ceph_api = wrapper.CephWrapper(endpoint='http://localhost:{}'.format(CEPH_MGR_PORT))
        response, body = ceph_api.health(body='text', timeout=30)
        if body == "HEALTH_OK":
            print("Cluster reports HEALTH_OK")
            return
        print(body)
    except IOError:
        print("Accessing Ceph API failed. Cluster health unknown. Proceeding.")

    cluster = {}
    try:
        cephcluster = kube.get_custom_resource(group, version, namespace,
                                               plural, name)
        if cephcluster and 'items' in cephcluster:
            cluster = cephcluster['items'][0]
    except ApiException as ex:
        if ex.reason == "Not Found":
            print("CephCluster custom resource not found in namespace %s: %s"
                  % (namespace, str(ex.body).replace('\n', ' ')))

    if cluster and "health" in cluster.get("status", {}).get("ceph", {}):
        health = cluster['status']['ceph']['health']
    else:
        print("Failed to get cluster['status']['ceph']['health']")
        return

    if health != "HEALTH_OK":
        delete_operator = False
        osd_nodes = cluster['spec']['storage']['nodes']

        nodes = {}
        node_list = kube.kube_get_nodes()
        for item in node_list:
            nodes[item.metadata.name] = item.spec.taints

        for n in osd_nodes:
            # osd devices declared for this node in the CephCluster spec
            node_name = n['name']
            osd_devices = n['devices']

            # check whether the osd pods described in the CephCluster spec
            # are actually running on this node
            label = "app=rook-ceph-osd,failure-domain=%s" % node_name
            pods = kube.kube_get_pods_by_selector(namespace, label, "") or []
            osd_pods = []
            for pod in pods:
                if pod.status.phase == 'Running':
                    osd_pods.append(pod)

            if len(osd_devices) != len(osd_pods):
                # assume the operator should be reset when the number of
                # running osd pods differs from this node's osd devices
                delete_operator = True

                # if the osd pods are not running because the node is
                # tainted, there is no need to delete the operator pod
                taints = nodes[node_name]
                if taints:
                    for taint in taints:
                        if taint.key.startswith("node.kubernetes.io"):
                            # pods are not running because of the taint
                            delete_operator = False

                if delete_operator:
                    break

        if delete_operator:
            operator_pod = kube.kube_get_pods_by_selector(
                namespace, "app=rook-ceph-operator", "")
            if operator_pod and operator_pod[0] and operator_pod[0].status.phase == 'Running':
                print("Deleting operator pod %s" % operator_pod[0].metadata.name)
                kube.kube_delete_pod(operator_pod[0].metadata.name, namespace,
                                     grace_periods_seconds=0)


if __name__ == '__main__':
    osd_audit()
{{- end -}}


@@ -29,9 +29,10 @@ data:
active=$node
fi
controller_node=$(kubectl get pods -n kube-system --selector=app="rook-ceph-mon,ceph_daemon_id=a" -o wide | awk '/controller/ {print $7}')
controller_node=$(kubectl get pods -n kube-system --selector=app="rook-ceph-mon,ceph_daemon_id=a" -o wide | awk '/Running.*controller/ {print $7}')
if [ x"$active" = x"$controller_node" ]; then
echo "mon-a pod is running on active controler"
exit 0
fi
@@ -62,9 +63,10 @@ kind: CronJob
metadata:
name: stx-ceph-mon-audit
spec:
schedule: {{ .Values.ceph_mon_audit_jobs.audit.cron | quote }}
successfulJobsHistoryLimit: {{ .Values.ceph_mon_audit_jobs.audit.history.success }}
failedJobsHistoryLimit: {{ .Values.ceph_mon_audit_jobs.audit.history.failed }}
schedule: {{ .Values.ceph_audit_jobs.audit.cron | quote }}
startingDeadlineSeconds: {{ .Values.ceph_audit_jobs.audit.deadline }}
successfulJobsHistoryLimit: {{ .Values.ceph_audit_jobs.audit.history.success }}
failedJobsHistoryLimit: {{ .Values.ceph_audit_jobs.audit.history.failed }}
concurrencyPolicy: Forbid
jobTemplate:
metadata:
@@ -104,7 +106,7 @@ spec:
- name: NAMESPACE
value: {{ .Release.Namespace }}
- name: FLOAT_IP
value: {{ .Values.ceph_mon_audit_jobs.floatIP }}
value: {{ .Values.ceph_audit_jobs.floatIP }}
volumeMounts:
- name: platform
mountPath: /opt/platform


@@ -0,0 +1,97 @@
{{/*
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
*/}}

{{- if .Values.global.job_ceph_osd_audit }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: ceph-osd-audit-bin
  namespace: {{ .Release.Namespace }}
data:
  osd_audit.py: |-
{{- include "script.osd_audit" . | indent 4 }}
---
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: stx-ceph-osd-audit
spec:
  schedule: {{ .Values.ceph_audit_jobs.audit.cron | quote }}
  startingDeadlineSeconds: {{ .Values.ceph_audit_jobs.audit.deadline }}
  successfulJobsHistoryLimit: {{ .Values.ceph_audit_jobs.audit.history.success }}
  failedJobsHistoryLimit: {{ .Values.ceph_audit_jobs.audit.history.failed }}
  concurrencyPolicy: Forbid
  jobTemplate:
    metadata:
      name: stx-ceph-osd-audit
      namespace: {{ .Release.Namespace }}
      labels:
        app: ceph-osd-audit
    spec:
      template:
        metadata:
          labels:
            app: ceph-osd-audit
        spec:
          serviceAccountName: {{ .Values.rbac.serviceAccount }}
          restartPolicy: OnFailure
          hostNetwork: true
          {{- if .Values.global.nodeSelector }}
          nodeSelector:
{{ .Values.global.nodeSelector | toYaml | trim | indent 10 }}
          {{- end }}
          volumes:
            - name: ceph-osd-audit-bin
              configMap:
                name: ceph-osd-audit-bin
                defaultMode: 0555
            - name: kube-config
              hostPath:
                path: /etc/kubernetes/admin.conf
            - name: config-key-provision
              configMap:
                name: {{ .Values.global.configmap_key_init }}
            - name: ceph-config
              emptyDir: {}
          initContainers:
            - name: init
              image: {{ .Values.images.tags.ceph_config_helper | quote }}
              command: [ "/bin/bash", "/tmp/mount/provision.sh" ]
              env:
                - name: ADMIN_KEYRING
                  valueFrom:
                    secretKeyRef:
                      name: rook-ceph-admin-keyring
                      key: keyring
                - name: ROOK_MONS
                  valueFrom:
                    configMapKeyRef:
                      name: rook-ceph-mon-endpoints
                      key: data
              volumeMounts:
                - name: ceph-config
                  mountPath: /etc/ceph
                - name: config-key-provision
                  mountPath: /tmp/mount
          containers:
            - name: ceph-osd-audit
              image: {{ .Values.images.tags.stx_ceph_manager | quote }}
              command: [ "python", "/tmp/mount/osd_audit.py" ]
              env:
                - name: NAMESPACE
                  value: {{ .Release.Namespace }}
              volumeMounts:
                - name: ceph-osd-audit-bin
                  mountPath: /tmp/mount
                - name: ceph-config
                  mountPath: /etc/ceph
                  readOnly: true
                - name: kube-config
                  mountPath: /etc/kubernetes/admin.conf
                  readOnly: true
{{- end }}


@@ -14,6 +14,7 @@ global:
provision_storage: true
job_ceph_mgr_provision: true
job_ceph_mon_audit: false
job_ceph_osd_audit: true
job_host_provision: true
job_cleanup: true
deployment_stx_ceph_manager: true
@@ -79,10 +80,11 @@ host_provision:
- controller-0
ceph_mon_audit_jobs:
ceph_audit_jobs:
floatIP: 192.168.204.2
audit:
cron: "*/3 * * * *"
deadline: 200
history:
success: 1
failed: 1