From 9e5fea6e18a846e38c9f662925aa8cd519a89e00 Mon Sep 17 00:00:00 2001
From: Vasyl Saienko
Date: Mon, 16 Sep 2024 05:39:48 +0000
Subject: [PATCH] [mariadb] Add cluster wait job

Add a job that waits until the initial bootstrapping of the cluster
is completed. This is required to pause db creation and initialization
while the cluster is not fully bootstrapped.

Change-Id: I705df1a1b1a34f464dc36a36dd7964f8a7bf72d9
---
 mariadb/Chart.yaml                            |   2 +-
 .../bin/_mariadb-wait-for-cluster.py.tpl      | 183 +++++++++++++++++
 mariadb/templates/configmap-bin.yaml          |   2 +
 mariadb/templates/job-cluster-wait.yaml       | 123 ++++++++++++
 mariadb/values.yaml                           |  20 ++
 releasenotes/notes/mariadb.yaml               |   1 +
 6 files changed, 330 insertions(+), 1 deletion(-)
 create mode 100644 mariadb/templates/bin/_mariadb-wait-for-cluster.py.tpl
 create mode 100644 mariadb/templates/job-cluster-wait.yaml

diff --git a/mariadb/Chart.yaml b/mariadb/Chart.yaml
index b4b34f5fe..5552339f0 100644
--- a/mariadb/Chart.yaml
+++ b/mariadb/Chart.yaml
@@ -15,7 +15,7 @@ apiVersion: v1
 appVersion: v10.6.7
 description: OpenStack-Helm MariaDB
 name: mariadb
-version: 0.2.49
+version: 0.2.50
 home: https://mariadb.com/kb/en/
 icon: http://badges.mariadb.org/mariadb-badge-180x60.png
 sources:
diff --git a/mariadb/templates/bin/_mariadb-wait-for-cluster.py.tpl b/mariadb/templates/bin/_mariadb-wait-for-cluster.py.tpl
new file mode 100644
index 000000000..fb36e271d
--- /dev/null
+++ b/mariadb/templates/bin/_mariadb-wait-for-cluster.py.tpl
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+
+import datetime
+import logging
+import os
+import sys
+import time
+
+import pymysql
+import pykube
+
+MARIADB_HOST = os.getenv("MARIADB_HOST")
+MARIADB_PASSWORD = os.getenv("MARIADB_PASSWORD")
+MARIADB_REPLICAS = os.getenv("MARIADB_REPLICAS")
+
+MARIADB_CLUSTER_STATE_LOG_LEVEL = os.getenv("MARIADB_CLUSTER_STATE_LOG_LEVEL", "INFO")
+
+MARIADB_CLUSTER_STABILITY_COUNT = int(
+    os.getenv("MARIADB_CLUSTER_STABILITY_COUNT", "30")
+)
+MARIADB_CLUSTER_STABILITY_WAIT = int(os.getenv("MARIADB_CLUSTER_STABILITY_WAIT", "4"))
+MARIADB_CLUSTER_CHECK_WAIT = int(os.getenv("MARIADB_CLUSTER_CHECK_WAIT", "30"))
+
+MARIADB_CLUSTER_STATE_CONFIGMAP = os.getenv("MARIADB_CLUSTER_STATE_CONFIGMAP")
+MARIADB_CLUSTER_STATE_CONFIGMAP_NAMESPACE = os.getenv(
+    "MARIADB_CLUSTER_STATE_CONFIGMAP_NAMESPACE", "openstack"
+)
+MARIADB_CLUSTER_STATE_PYKUBE_REQUEST_TIMEOUT = int(
+    os.getenv("MARIADB_CLUSTER_STATE_PYKUBE_REQUEST_TIMEOUT", "60")
+)
+
+logging.basicConfig(
+    stream=sys.stdout,
+    format="%(asctime)s %(levelname)s %(name)s %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+LOG = logging.getLogger("mariadb-cluster-wait")
+LOG.setLevel(MARIADB_CLUSTER_STATE_LOG_LEVEL)
+
+
+def login():
+    """Create a Kubernetes API client from the environment kubeconfig."""
+    config = pykube.KubeConfig.from_env()
+    client = pykube.HTTPClient(
+        config=config, timeout=MARIADB_CLUSTER_STATE_PYKUBE_REQUEST_TIMEOUT
+    )
+    LOG.info(f"Created k8s api client from context {config.current_context}")
+    return client
+
+
+api = login()
+
+
+def retry(times, exceptions):
+    """Retry the wrapped function up to ``times`` times on ``exceptions``."""
+
+    def decorator(func):
+        def newfn(*args, **kwargs):
+            attempt = 0
+            while attempt < times:
+                try:
+                    return func(*args, **kwargs)
+                except exceptions:
+                    attempt += 1
+                    LOG.exception(
+                        f"Exception thrown when attempting to run {func}, attempt {attempt} of {times}"
+                    )
+            # Final attempt: let the exception propagate if it fails again.
+            return func(*args, **kwargs)
+
+        return newfn
+
+    return decorator
+
+
+class InitialClusterState:
+    """Tracks the initial cluster bootstrap state in a ConfigMap."""
+
+    initial_state_key = "initial-bootstrap-completed.cluster"
+
+    @retry(times=100, exceptions=(Exception,))
+    def __init__(self, api, namespace, name):
+        self.namespace = namespace
+        self.name = name
+        self.cm = (
+            pykube.ConfigMap.objects(api)
+            .filter(namespace=self.namespace)
+            .get_by_name(self.name)
+        )
+
+    def get_default(self):
+        """We have deployments with a completed job, but it is not reflected
+        in the configmap state. Assume that when the configmap was created
+        more than 1h ago, we are doing an update/restart and the key is not
+        in the map, this is an existing environment. So we assume the cluster
+        was initially bootstrapped. This is needed to avoid manual actions.
+        """
+        now = datetime.datetime.utcnow()
+        created_at = datetime.datetime.strptime(
+            self.cm.obj["metadata"]["creationTimestamp"], "%Y-%m-%dT%H:%M:%SZ"
+        )
+        delta = datetime.timedelta(seconds=3600)
+
+        if now - created_at > delta:
+            self.complete()
+            return "COMPLETED"
+        return "NOT_COMPLETED"
+
+    @property
+    @retry(times=10, exceptions=(Exception,))
+    def is_completed(self):
+        """True when the initial bootstrap has been marked as completed."""
+        self.cm.reload()
+        state = self.cm.obj.get("data", {}).get(self.initial_state_key)
+        if state is None:
+            state = self.get_default()
+        return state == "COMPLETED"
+
+    @retry(times=100, exceptions=(Exception,))
+    def complete(self):
+        """Mark the initial bootstrap as completed in the ConfigMap."""
+        patch = {"data": {self.initial_state_key: "COMPLETED"}}
+        self.cm.patch(patch)
+
+
+ics = InitialClusterState(
+    api, MARIADB_CLUSTER_STATE_CONFIGMAP_NAMESPACE, MARIADB_CLUSTER_STATE_CONFIGMAP
+)
+
+if ics.is_completed:
+    LOG.info("The initial bootstrap was completed, skipping wait...")
+    sys.exit(0)
+
+LOG.info("Checking for mariadb cluster state.")
+
+
+def is_mariadb_stable():
+    """Return True when all wsrep status variables report a healthy cluster."""
+    try:
+        wsrep_ok = {
+            "wsrep_ready": "ON",
+            "wsrep_connected": "ON",
+            "wsrep_cluster_status": "Primary",
+            "wsrep_local_state_comment": "Synced",
+            "wsrep_cluster_size": str(MARIADB_REPLICAS),
+        }
+        wsrep_vars = ",".join(["'" + var + "'" for var in wsrep_ok.keys()])
+        db_cursor = pymysql.connect(
+            host=MARIADB_HOST,
+            password=MARIADB_PASSWORD,
+            read_default_file="/etc/mysql/admin_user.cnf",
+        ).cursor()
+        db_cursor.execute(f"SHOW GLOBAL STATUS WHERE Variable_name IN ({wsrep_vars})")
+        wsrep_status = db_cursor.fetchall()
+        diff = set(wsrep_status).difference(set(wsrep_ok.items()))
+        if diff:
+            LOG.error(f"The wsrep is not OK: {diff}")
+        else:
+            LOG.info("The wsrep is ready")
+            return True
+    except Exception as e:
+        LOG.error(f"Got exception while checking state. {e}")
+    return False
+
+
+stable_for = 0
+
+while True:
+    if is_mariadb_stable():
+        stable_for += 1
+        LOG.info(
+            f"The cluster is stable for {stable_for} out of {MARIADB_CLUSTER_STABILITY_COUNT}"
+        )
+        if stable_for == MARIADB_CLUSTER_STABILITY_COUNT:
+            ics.complete()
+            sys.exit(0)
+        LOG.info(f"Sleeping for {MARIADB_CLUSTER_STABILITY_WAIT}")
+        time.sleep(MARIADB_CLUSTER_STABILITY_WAIT)
+    else:
+        LOG.info("Resetting stable_for count.")
+        stable_for = 0
+        LOG.info(f"Sleeping for {MARIADB_CLUSTER_CHECK_WAIT}")
+        time.sleep(MARIADB_CLUSTER_CHECK_WAIT)
diff --git a/mariadb/templates/configmap-bin.yaml b/mariadb/templates/configmap-bin.yaml
index 7b6e18ab2..991d83d8b 100644
--- a/mariadb/templates/configmap-bin.yaml
+++ b/mariadb/templates/configmap-bin.yaml
@@ -57,4 +57,6 @@ data:
   mariadb_controller.py: |
 {{ tuple "bin/_mariadb_controller.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
 {{- end }}
+  mariadb-wait-for-cluster.py: |
+{{ tuple "bin/_mariadb-wait-for-cluster.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
 {{- end }}
diff --git a/mariadb/templates/job-cluster-wait.yaml b/mariadb/templates/job-cluster-wait.yaml
new file mode 100644
index 000000000..4a239de3e
--- /dev/null
+++ b/mariadb/templates/job-cluster-wait.yaml
@@ -0,0 +1,123 @@
+{{/*
+Copyright 2019 Mirantis inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/}}
+
+{{- if .Values.manifests.job_cluster_wait }}
+{{- $envAll := . }}
+
+{{- $serviceAccountName := print .Release.Name "-cluster-wait" }}
+{{ tuple $envAll "cluster_wait" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: {{ $envAll.Release.Name }}-{{ $serviceAccountName }}-pod
+  namespace: {{ $envAll.Release.Namespace }}
+rules:
+  - apiGroups:
+      - ""
+    resources:
+      - configmaps
+    verbs:
+      - update
+      - patch
+      - get
+      - list
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: {{ $envAll.Release.Name }}-{{ $serviceAccountName }}-pod
+  namespace: {{ $envAll.Release.Namespace }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: {{ $envAll.Release.Name }}-{{ $serviceAccountName }}-pod
+subjects:
+  - kind: ServiceAccount
+    name: {{ $serviceAccountName }}
+    namespace: {{ $envAll.Release.Namespace }}
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: "{{.Release.Name}}-cluster-wait"
+  labels:
+{{ tuple $envAll "mariadb" "cluster-wait" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 4 }}
+  annotations:
+    {{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" }}
+spec:
+  backoffLimit: {{ .Values.jobs.cluster_wait.clusterCheckRetries }}
+  template:
+    metadata:
+      labels:
+{{ tuple $envAll "mariadb" "cluster-wait" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 8 }}
+    spec:
+{{ dict "envAll" $envAll "application" "cluster_wait" | include "helm-toolkit.snippets.kubernetes_pod_security_context" | indent 6 }}
+      serviceAccountName: {{ $serviceAccountName }}
+      restartPolicy: OnFailure
+      nodeSelector:
+        {{ .Values.labels.job.node_selector_key }}: {{ .Values.labels.job.node_selector_value }}
+      initContainers:
+{{ tuple $envAll "cluster_wait" list | include "helm-toolkit.snippets.kubernetes_entrypoint_init_container" | indent 8 }}
+      containers:
+        - name: {{.Release.Name}}-mariadb-cluster-wait
+{{ tuple $envAll "mariadb_scripted_test" | include "helm-toolkit.snippets.image" | indent 10 }}
+{{ dict "envAll" $envAll "application" "cluster_wait" "container" "mariadb_cluster_wait" | include "helm-toolkit.snippets.kubernetes_container_security_context" | indent 10 }}
+          env:
+            - name: MARIADB_HOST
+              value: {{ tuple "oslo_db" "internal" $envAll | include "helm-toolkit.endpoints.endpoint_host_lookup" }}
+            - name: MARIADB_REPLICAS
+              value: {{ .Values.pod.replicas.server | quote }}
+            - name: MARIADB_CLUSTER_CHECK_WAIT
+              value: {{ .Values.jobs.cluster_wait.clusterCheckWait | quote }}
+            - name: MARIADB_CLUSTER_STABILITY_COUNT
+              value: {{ .Values.jobs.cluster_wait.clusterStabilityCount | quote }}
+            - name: MARIADB_CLUSTER_STABILITY_WAIT
+              value: {{ .Values.jobs.cluster_wait.clusterStabilityWait | quote }}
+            - name: MARIADB_CLUSTER_STATE_CONFIGMAP
+              value: {{ printf "%s-%s" .Release.Name "mariadb-state" | quote }}
+            - name: MARIADB_CLUSTER_STATE_CONFIGMAP_NAMESPACE
+              value: {{ $envAll.Release.Namespace }}
+            - name: MARIADB_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: mariadb-dbadmin-password
+                  key: MYSQL_DBADMIN_PASSWORD
+          command:
+            - /tmp/mariadb-wait-for-cluster.py
+          volumeMounts:
+            - name: pod-tmp
+              mountPath: /tmp
+            - name: mariadb-bin
+              mountPath: /tmp/mariadb-wait-for-cluster.py
+              subPath: mariadb-wait-for-cluster.py
+              readOnly: true
+            - name: mariadb-secrets
+              mountPath: /etc/mysql/admin_user.cnf
+              subPath: admin_user.cnf
+              readOnly: true
+      volumes:
+        - name: pod-tmp
+          emptyDir: {}
+        - name: mariadb-bin
+          configMap:
+            name: mariadb-bin
+            defaultMode: 0555
+        - name: mariadb-secrets
+          secret:
+            secretName: mariadb-secrets
+            defaultMode: 0444
+{{- end }}
diff --git a/mariadb/values.yaml b/mariadb/values.yaml
index 68e4488d2..7051a1125 100644
--- a/mariadb/values.yaml
+++ b/mariadb/values.yaml
@@ -130,6 +130,16 @@ pod:
         controller:
           allowPrivilegeEscalation: false
           readOnlyRootFilesystem: true
+    cluster_wait:
+      pod:
+        runAsUser: 65534
+        runAsNonRoot: true
+      container:
+        mariadb_cluster_wait:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+              - ALL
   affinity:
     anti:
       type:
@@ -238,6 +248,10 @@ dependencies:
           service: oslo_db
     controller:
       services: null
+    cluster_wait:
+      services:
+        - endpoint: internal
+          service: oslo_db
 volume:
   # this value is used for single pod deployments of mariadb to prevent losing all data
   # if the pod is restarted
@@ -254,6 +268,11 @@ volume:
     size: 5Gi
 
 jobs:
+  cluster_wait:
+    clusterCheckWait: 30
+    clusterCheckRetries: 30
+    clusterStabilityCount: 30
+    clusterStabilityWait: 4
   exporter_create_sql_user:
     backoffLimit: 87600
     activeDeadlineSeconds: 3600
@@ -672,4 +691,5 @@ manifests:
   statefulset: true
   deployment_controller: true
   service_master: true
+  job_cluster_wait: false
 ...
diff --git a/releasenotes/notes/mariadb.yaml b/releasenotes/notes/mariadb.yaml
index 00f0dcc14..3ba0b73eb 100644
--- a/releasenotes/notes/mariadb.yaml
+++ b/releasenotes/notes/mariadb.yaml
@@ -65,4 +65,5 @@ mariadb:
     - 0.2.47 Deploy exporter as sidecar
     - 0.2.48 Switch to mariadb controller deployment
     - 0.2.49 Remove ingress deployment
+    - 0.2.50 Add cluster-wait job
 ...