Add cron job for osd audit
After a host swact, the mon and osd deployments on the previously active controller are deleted, and DRBD switches the /var/lib/ceph/mon folder between primary and secondary on the two controllers. Once the swact completes, rook-ceph-operator launches new mon and osd deployments on the newly active controller. If one controller shuts down unexpectedly during this window, only the osd deployments on the still-powered controller remain, and they are not restored even after the other controller powers back on. Add a cron job that audits the osd deployment status so the cluster can recover its health.

Partial-Bug: 1920882
Depends-On: I4afde8b1476e14453fac8561f1edde7360b8ee96
Change-Id: I39cb66daecf4052821ceb28344a90ea70f63a742
Signed-off-by: Chen, Haochuan Z <haochuan.z.chen@intel.com>
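For orientation, the decision the new audit makes can be condensed roughly as follows. This is a simplified sketch of the logic in the osd_audit.py template added below, not the script itself; the callable parameters (declared_osd_devices, running_osd_pods, node_taints) stand in for the CephCluster and Kubernetes lookups the real script performs.

# Condensed sketch of the osd audit decision (see osd_audit.py below).
def should_restart_operator(osd_nodes, declared_osd_devices,
                            running_osd_pods, node_taints):
    """Return True when some node runs fewer osd pods than it declares
    osd devices and the shortfall is not explained by a node taint."""
    for node in osd_nodes:
        if len(running_osd_pods(node)) == len(declared_osd_devices(node)):
            continue
        # A node.kubernetes.io/* taint explains the missing pods, so the
        # operator does not need to be restarted on account of this node.
        if any(t.startswith("node.kubernetes.io") for t in node_taints(node)):
            continue
        return True
    return False

When this returns True, the audit deletes the rook-ceph-operator pod so that it re-reconciles the cluster and relaunches the missing osd deployments.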
@@ -82,7 +82,7 @@ class RookCephProvisionerHelm(base.BaseHelm):
                "host_provision": {
                    "controller_hosts": self._get_controller_hosts(),
                },
-               "ceph_mon_audit_jobs": self._get_ceph_mon_audit(),
+               "ceph_audit_jobs": self._get_ceph_audit(),
            }
        }

@@ -149,7 +149,7 @@ class RookCephProvisionerHelm(base.BaseHelm):

        return controller_hosts

-    def _get_ceph_mon_audit(self):
+    def _get_ceph_audit(self):
        audit = {}

        if utils.is_aio_duplex_system(self.dbapi):
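The rest of _get_ceph_audit() falls outside this hunk. Judging from the ceph_audit_jobs block added to values.yaml further down, it plausibly assembles the audit overrides along these lines; this is a hedged sketch only, and _get_management_float_ip() is a hypothetical helper used to illustrate where the floating IP would come from.

# Hypothetical sketch -- the real body is truncated in this hunk.
def _get_ceph_audit(self):
    audit = {}

    if utils.is_aio_duplex_system(self.dbapi):
        # On AIO-DX the audit cron jobs need the management floating IP
        # to tell which controller is currently active.
        audit.update({'floatIP': self._get_management_float_ip()})  # hypothetical helper

    return audit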
@@ -10,11 +10,12 @@ stxdir = {toxinidir}/../../..
distshare={toxworkdir}/.tox/distshare

[testenv]
-# usedevelop = True
# enabling usedevelop results in py27 develop-inst:
# Exception: Versioning for this project requires either an sdist tarball,
# or access to an upstream git repository.
# Note. site-packages is true and rpm-python must be yum installed on your dev machine.
+usedevelop = True
+basepython = python3
sitepackages = True

# tox is silly... these need to be separated by a newline....
@@ -99,7 +100,7 @@ deps = -r{toxinidir}/test-requirements.txt
commands = bandit --ini tox.ini -n 5 -r k8sapp_rook

[testenv:pylint]
-basepython = python2.7
+basepython = python3
sitepackages = False
deps = {[testenv]deps}
commands =
@@ -0,0 +1,196 @@
{{- define "script.osd_audit" -}}
#!/usr/bin/env python

import os

from cephclient import wrapper
from kubernetes import config
from kubernetes import client
from kubernetes.client import Configuration
from kubernetes.client.rest import ApiException
from six.moves import http_client as httplib

# Kubernetes Files
KUBERNETES_ADMIN_CONF = '/etc/kubernetes/admin.conf'

CEPH_MGR_PORT = 7999


def is_k8s_configured():
    """Check to see if the k8s admin config file exists."""
    if os.path.isfile(KUBERNETES_ADMIN_CONF):
        return True
    return False


class KubeOperator(object):

    def __init__(self):
        self._kube_client_batch = None
        self._kube_client_core = None
        self._kube_client_custom_objects = None

    def _load_kube_config(self):
        if not is_k8s_configured():
            raise RuntimeError("kubernetes is not configured: %s not found" %
                               KUBERNETES_ADMIN_CONF)

        config.load_kube_config(KUBERNETES_ADMIN_CONF)

        # Workaround: Turn off SSL/TLS verification
        c = Configuration()
        c.verify_ssl = False
        Configuration.set_default(c)

    def _get_kubernetesclient_core(self):
        if not self._kube_client_core:
            self._load_kube_config()
            self._kube_client_core = client.CoreV1Api()
        return self._kube_client_core

    def _get_kubernetesclient_custom_objects(self):
        if not self._kube_client_custom_objects:
            self._load_kube_config()
            self._kube_client_custom_objects = client.CustomObjectsApi()
        return self._kube_client_custom_objects

    def kube_get_nodes(self):
        try:
            api_response = self._get_kubernetesclient_core().list_node()
            return api_response.items
        except ApiException as e:
            print("Kubernetes exception in kube_get_nodes: %s" % e)
            raise

    def kube_get_pods_by_selector(self, namespace, label_selector,
                                  field_selector):
        c = self._get_kubernetesclient_core()
        try:
            api_response = c.list_namespaced_pod(
                namespace,
                label_selector="%s" % label_selector,
                field_selector="%s" % field_selector)
            return api_response.items
        except ApiException as e:
            print("Kubernetes exception in "
                  "kube_get_pods_by_selector %s/%s/%s: %s" %
                  (namespace, label_selector, field_selector, e))

        return None

    def kube_delete_pod(self, name, namespace, **kwargs):
        body = {}

        if kwargs:
            body.update(kwargs)

        c = self._get_kubernetesclient_core()
        try:
            c.delete_namespaced_pod(name, namespace, body)
            return True
        except ApiException as e:
            if e.status == httplib.NOT_FOUND:
                print("Pod %s/%s not found." % (namespace, name))
                return False
            else:
                print("Failed to delete Pod %s/%s: %s" % (namespace, name, e.body))
                raise

    def get_custom_resource(self, group, version, namespace, plural, name):
        c = self._get_kubernetesclient_custom_objects()

        try:
            api_response = c.list_namespaced_custom_object(group, version,
                                                           namespace, plural)
            return api_response
        except ApiException as ex:
            if ex.reason == "Not Found":
                print("Failed to get custom object %s, Namespace %s: %s" %
                      (name, namespace, str(ex.body).replace('\n', ' ')))

        return None


def osd_audit():
    kube = KubeOperator()
    group = "ceph.rook.io"
    version = "v1"
    namespace = "kube-system"
    plural = "cephclusters"
    name = "cephclusters.ceph.rook.io.ceph-cluster"

    try:
        ceph_api = wrapper.CephWrapper(endpoint='http://localhost:{}'.format(CEPH_MGR_PORT))
        response, body = ceph_api.health(body='text', timeout=30)
        if body == "HEALTH_OK":
            print("Cluster reports HEALTH_OK")
            return
        print(body)
    except IOError:
        print("Accessing Ceph API failed. Cluster health unknown. Proceeding.")

    cluster = {}
    try:
        cephcluster = kube.get_custom_resource(group, version, namespace, plural, name)
        if cephcluster and 'items' in cephcluster:
            cluster = cephcluster['items'][0]
    except ApiException as ex:
        if ex.reason == "Not Found":
            print("Failed to get cephcluster, Namespace %s: %s" %
                  (namespace, str(ex.body).replace('\n', ' ')))

    health = ""
    if cluster and "status" in cluster and "ceph" in cluster["status"] \
            and "health" in cluster["status"]["ceph"]:
        health = cluster['status']['ceph']['health']
    else:
        print("Failed to get cluster['status']['ceph']['health']")
        return

    if health != "HEALTH_OK":
        delete_operator = False
        osd_nodes = cluster['spec']['storage']['nodes']
        nodes = {}

        node_list = kube.kube_get_nodes()
        for item in node_list:
            nodes[item.metadata.name] = item.spec.taints

        for n in osd_nodes:
            # get osd info declared in the ceph cluster
            node_name = n['name']
            osd_devices = n['devices']

            # check whether an osd pod is running for every osd device
            # described for this node in the cephcluster spec
            label = "app=rook-ceph-osd,failure-domain=%s" % node_name
            pods = kube.kube_get_pods_by_selector(namespace, label, "")

            osd_pods = []
            for pod in pods:
                if pod.status.phase == 'Running':
                    osd_pods.append(pod)

            if len(osd_devices) != len(osd_pods):
                # assume that when the number of running osd pods does not
                # match this node's osd devices, the operator should reset
                delete_operator = True

                # if the osd pod is not running because this node is
                # tainted, there is no need to delete the operator pod
                taints = nodes[node_name]
                if taints:
                    for taint in taints:
                        if taint.key.startswith("node.kubernetes.io"):
                            # pod not running because of the taint
                            delete_operator = False

            if delete_operator:
                break

        if delete_operator:
            operator_pod = kube.kube_get_pods_by_selector(namespace, "app=rook-ceph-operator", "")
            if operator_pod and operator_pod[0] and operator_pod[0].status.phase == 'Running':
                print("delete operator pod")
                kube.kube_delete_pod(operator_pod[0].metadata.name, namespace,
                                     grace_period_seconds=0)


if __name__ == '__main__':
    osd_audit()
{{- end -}}
@@ -29,9 +29,10 @@ data:
        active=$node
      fi

-     controller_node=$(kubectl get pods -n kube-system --selector=app="rook-ceph-mon,ceph_daemon_id=a" -o wide | awk '/controller/ {print $7}')
+     controller_node=$(kubectl get pods -n kube-system --selector=app="rook-ceph-mon,ceph_daemon_id=a" -o wide | awk '/Running.*controller/ {print $7}')
      if [ x"$active" = x"$controller_node" ]; then
        echo "mon-a pod is running on active controler"

        exit 0
      fi
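The awk change above makes the mon audit consider the mon-a pod's node only when the pod is actually Running before comparing it with the active controller. For reference, the same check expressed with the kubernetes Python client used by osd_audit.py would look roughly like the sketch below; it is an illustration, not part of this change, and core_v1 is assumed to be a configured client.CoreV1Api() instance.

# Sketch: find the node hosting a Running rook-ceph-mon "a" pod,
# mirroring the kubectl | awk '/Running.*controller/' pipeline above.
def running_mon_a_node(core_v1, namespace="kube-system"):
    pods = core_v1.list_namespaced_pod(
        namespace,
        label_selector="app=rook-ceph-mon,ceph_daemon_id=a").items
    for pod in pods:
        if pod.status.phase == "Running":
            return pod.spec.node_name
    return None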
@@ -62,9 +63,10 @@ kind: CronJob
metadata:
  name: stx-ceph-mon-audit
spec:
-  schedule: {{ .Values.ceph_mon_audit_jobs.audit.cron | quote }}
-  successfulJobsHistoryLimit: {{ .Values.ceph_mon_audit_jobs.audit.history.success }}
-  failedJobsHistoryLimit: {{ .Values.ceph_mon_audit_jobs.audit.history.failed }}
+  schedule: {{ .Values.ceph_audit_jobs.audit.cron | quote }}
+  startingDeadlineSeconds: {{ .Values.ceph_audit_jobs.audit.deadline }}
+  successfulJobsHistoryLimit: {{ .Values.ceph_audit_jobs.audit.history.success }}
+  failedJobsHistoryLimit: {{ .Values.ceph_audit_jobs.audit.history.failed }}
  concurrencyPolicy: Forbid
  jobTemplate:
    metadata:
@@ -104,7 +106,7 @@ spec:
            - name: NAMESPACE
              value: {{ .Release.Namespace }}
            - name: FLOAT_IP
-             value: {{ .Values.ceph_mon_audit_jobs.floatIP }}
+             value: {{ .Values.ceph_audit_jobs.floatIP }}
          volumeMounts:
          - name: platform
            mountPath: /opt/platform
@@ -0,0 +1,97 @@
{{/*
#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
*/}}

{{- if .Values.global.job_ceph_osd_audit }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: ceph-osd-audit-bin
  namespace: {{ .Release.Namespace }}
data:
  osd_audit.py: |-
{{- include "script.osd_audit" . | indent 4 }}
---
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: stx-ceph-osd-audit
spec:
  schedule: {{ .Values.ceph_audit_jobs.audit.cron | quote }}
  startingDeadlineSeconds: {{ .Values.ceph_audit_jobs.audit.deadline }}
  successfulJobsHistoryLimit: {{ .Values.ceph_audit_jobs.audit.history.success }}
  failedJobsHistoryLimit: {{ .Values.ceph_audit_jobs.audit.history.failed }}
  concurrencyPolicy: Forbid
  jobTemplate:
    metadata:
      name: stx-ceph-osd-audit
      namespace: {{ .Release.Namespace }}
      labels:
        app: ceph-osd-audit
    spec:
      template:
        metadata:
          labels:
            app: ceph-osd-audit
        spec:
          serviceAccountName: {{ .Values.rbac.serviceAccount }}
          restartPolicy: OnFailure
          hostNetwork: true
          {{- if .Values.global.nodeSelector }}
          nodeSelector:
{{ .Values.global.nodeSelector | toYaml | trim | indent 10 }}
          {{- end }}
          volumes:
          - name: ceph-osd-audit-bin
            configMap:
              name: ceph-osd-audit-bin
              defaultMode: 0555
          - name: kube-config
            hostPath:
              path: /etc/kubernetes/admin.conf
          - name: config-key-provision
            configMap:
              name: {{ .Values.global.configmap_key_init }}
          - name: ceph-config
            emptyDir: {}
          initContainers:
          - name: init
            image: {{ .Values.images.tags.ceph_config_helper | quote }}
            command: [ "/bin/bash", "/tmp/mount/provision.sh" ]
            env:
            - name: ADMIN_KEYRING
              valueFrom:
                secretKeyRef:
                  name: rook-ceph-admin-keyring
                  key: keyring
            - name: ROOK_MONS
              valueFrom:
                configMapKeyRef:
                  name: rook-ceph-mon-endpoints
                  key: data
            volumeMounts:
            - name: ceph-config
              mountPath: /etc/ceph
            - name: config-key-provision
              mountPath: /tmp/mount
          containers:
          - name: ceph-osd-audit
            image: {{ .Values.images.tags.stx_ceph_manager | quote }}
            command: [ "python", "/tmp/mount/osd_audit.py" ]
            env:
            - name: NAMESPACE
              value: {{ .Release.Namespace }}
            volumeMounts:
            - name: ceph-osd-audit-bin
              mountPath: /tmp/mount
            - name: ceph-config
              mountPath: /etc/ceph
              readOnly: true
            - name: kube-config
              mountPath: /etc/kubernetes/admin.conf
              readOnly: true
{{- end }}
@@ -14,6 +14,7 @@ global:
  provision_storage: true
  job_ceph_mgr_provision: true
  job_ceph_mon_audit: false
+  job_ceph_osd_audit: true
  job_host_provision: true
  job_cleanup: true
  deployment_stx_ceph_manager: true
@@ -79,10 +80,11 @@ host_provision:
  - controller-0


-ceph_mon_audit_jobs:
+ceph_audit_jobs:
  floatIP: 192.168.204.2
  audit:
    cron: "*/3 * * * *"
+    deadline: 200
    history:
      success: 1
      failed: 1