diff --git a/mariadb/Chart.yaml b/mariadb/Chart.yaml
index b4b34f5fe..5552339f0 100644
--- a/mariadb/Chart.yaml
+++ b/mariadb/Chart.yaml
@@ -15,7 +15,7 @@ apiVersion: v1
 appVersion: v10.6.7
 description: OpenStack-Helm MariaDB
 name: mariadb
-version: 0.2.49
+version: 0.2.50
 home: https://mariadb.com/kb/en/
 icon: http://badges.mariadb.org/mariadb-badge-180x60.png
 sources:
diff --git a/mariadb/templates/bin/_mariadb-wait-for-cluster.py.tpl b/mariadb/templates/bin/_mariadb-wait-for-cluster.py.tpl
new file mode 100644
index 000000000..fb36e271d
--- /dev/null
+++ b/mariadb/templates/bin/_mariadb-wait-for-cluster.py.tpl
@@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+
+import datetime
+import logging
+import os
+import sys
+import time
+
+import pykube
+import pymysql
+
+MARIADB_HOST = os.getenv("MARIADB_HOST")
+MARIADB_PASSWORD = os.getenv("MARIADB_PASSWORD")
+MARIADB_REPLICAS = os.getenv("MARIADB_REPLICAS")
+
+MARIADB_CLUSTER_STATE_LOG_LEVEL = os.getenv("MARIADB_CLUSTER_STATE_LOG_LEVEL", "INFO")
+
+MARIADB_CLUSTER_STABILITY_COUNT = int(
+    os.getenv("MARIADB_CLUSTER_STABILITY_COUNT", "30")
+)
+MARIADB_CLUSTER_STABILITY_WAIT = int(os.getenv("MARIADB_CLUSTER_STABILITY_WAIT", "4"))
+MARIADB_CLUSTER_CHECK_WAIT = int(os.getenv("MARIADB_CLUSTER_CHECK_WAIT", "30"))
+
+MARIADB_CLUSTER_STATE_CONFIGMAP = os.getenv("MARIADB_CLUSTER_STATE_CONFIGMAP")
+MARIADB_CLUSTER_STATE_CONFIGMAP_NAMESPACE = os.getenv(
+    "MARIADB_CLUSTER_STATE_CONFIGMAP_NAMESPACE", "openstack"
+)
+MARIADB_CLUSTER_STATE_PYKUBE_REQUEST_TIMEOUT = int(
+    os.getenv("MARIADB_CLUSTER_STATE_PYKUBE_REQUEST_TIMEOUT", "60")
+)
+
+logging.basicConfig(
+    stream=sys.stdout,
+    format="%(asctime)s %(levelname)s %(name)s %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+LOG = logging.getLogger("mariadb-cluster-wait")
+LOG.setLevel(MARIADB_CLUSTER_STATE_LOG_LEVEL)
+
+
+def login():
+    config = pykube.KubeConfig.from_env()
+    client = pykube.HTTPClient(
+        config=config, timeout=MARIADB_CLUSTER_STATE_PYKUBE_REQUEST_TIMEOUT
+    )
+    LOG.info(f"Created k8s api client from context {config.current_context}")
+    return client
+
+
+api = login()
+
+
+def retry(times, exceptions):
+    def decorator(func):
+        def newfn(*args, **kwargs):
+            attempt = 0
+            while attempt < times:
+                try:
+                    return func(*args, **kwargs)
+                except exceptions:
+                    attempt += 1
+                    LOG.exception(
+                        f"Exception thrown when attempting to run {func}, "
+                        f"attempt {attempt} of {times}"
+                    )
+            # Final attempt outside the loop so the last exception propagates.
+            return func(*args, **kwargs)
+
+        return newfn
+
+    return decorator
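+
+
+# Illustrative only (not used by the wait logic below): the decorator can
+# wrap any flaky call so it retries on the listed exceptions, e.g.
+#
+#   @retry(times=3, exceptions=(pymysql.MySQLError,))
+#   def ping():
+#       pymysql.connect(host=MARIADB_HOST, password=MARIADB_PASSWORD).ping()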
+ """ + now = datetime.datetime.utcnow() + created_at = datetime.datetime.strptime( + self.cm.obj["metadata"]["creationTimestamp"], "%Y-%m-%dT%H:%M:%SZ" + ) + delta = datetime.timedelta(seconds=3600) + + if now - created_at > delta: + self.complete() + return "COMPLETED" + return "NOT_COMPLETED" + + @property + @retry(times=10, exceptions=(Exception)) + def is_completed(self): + + self.cm.reload() + if self.initial_state_key in self.cm.obj["data"]: + return self.cm.obj["data"][self.initial_state_key] + + return self.get_default() == "COMPLETED" + + @retry(times=100, exceptions=(Exception)) + def complete(self): + patch = {"data": {self.initial_state_key: "COMPLETED"}} + self.cm.patch(patch) + + +ics = initalClusterState( + api, MARIADB_CLUSTER_STATE_CONFIGMAP_NAMESPACE, MARIADB_CLUSTER_STATE_CONFIGMAP +) + +if ics.is_completed: + LOG.info("The initial bootstrap was completed, skipping wait...") + sys.exit(0) + +LOG.info("Checking for mariadb cluster state.") + + +def is_mariadb_stabe(): + try: + wsrep_OK = { + "wsrep_ready": "ON", + "wsrep_connected": "ON", + "wsrep_cluster_status": "Primary", + "wsrep_local_state_comment": "Synced", + "wsrep_cluster_size": str(MARIADB_REPLICAS), + } + wsrep_vars = ",".join(["'" + var + "'" for var in wsrep_OK.keys()]) + db_cursor = pymysql.connect( + host=MARIADB_HOST, password=MARIADB_PASSWORD, + read_default_file="/etc/mysql/admin_user.cnf" + ).cursor() + db_cursor.execute(f"SHOW GLOBAL STATUS WHERE Variable_name IN ({wsrep_vars})") + wsrep_vars = db_cursor.fetchall() + diff = set(wsrep_vars).difference(set(wsrep_OK.items())) + if diff: + LOG.error(f"The wsrep is not OK: {diff}") + else: + LOG.info("The wspep is ready") + return True + except Exception as e: + LOG.error(f"Got exception while checking state. {e}") + return False + + +count = 0 +ready = False +stable_for = 1 + +while True: + if is_mariadb_stabe(): + stable_for += 1 + LOG.info( + f"The cluster is stable for {stable_for} out of {MARIADB_CLUSTER_STABILITY_COUNT}" + ) + if stable_for == MARIADB_CLUSTER_STABILITY_COUNT: + ics.complete() + sys.exit(0) + else: + LOG.info(f"Sleeping for {MARIADB_CLUSTER_STABILITY_WAIT}") + time.sleep(MARIADB_CLUSTER_STABILITY_WAIT) + continue + else: + LOG.info("Resetting stable_for count.") + stable_for = 0 + LOG.info(f"Sleeping for {MARIADB_CLUSTER_CHECK_WAIT}") + time.sleep(MARIADB_CLUSTER_CHECK_WAIT) diff --git a/mariadb/templates/configmap-bin.yaml b/mariadb/templates/configmap-bin.yaml index 7b6e18ab2..991d83d8b 100644 --- a/mariadb/templates/configmap-bin.yaml +++ b/mariadb/templates/configmap-bin.yaml @@ -57,4 +57,6 @@ data: mariadb_controller.py: | {{ tuple "bin/_mariadb_controller.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} {{- end }} + mariadb-wait-for-cluster.py: | +{{ tuple "bin/_mariadb-wait-for-cluster.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }} {{- end }} diff --git a/mariadb/templates/job-cluster-wait.yaml b/mariadb/templates/job-cluster-wait.yaml new file mode 100644 index 000000000..4a239de3e --- /dev/null +++ b/mariadb/templates/job-cluster-wait.yaml @@ -0,0 +1,123 @@ +{{/* +Copyright 2019 Mirantis inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+
+stable_for = 0
+
+while True:
+    if is_mariadb_stable():
+        stable_for += 1
+        LOG.info(
+            f"The cluster is stable for {stable_for} out of {MARIADB_CLUSTER_STABILITY_COUNT}"
+        )
+        if stable_for == MARIADB_CLUSTER_STABILITY_COUNT:
+            ics.complete()
+            sys.exit(0)
+        LOG.info(f"Sleeping for {MARIADB_CLUSTER_STABILITY_WAIT}s")
+        time.sleep(MARIADB_CLUSTER_STABILITY_WAIT)
+        continue
+    LOG.info("Resetting stable_for count.")
+    stable_for = 0
+    LOG.info(f"Sleeping for {MARIADB_CLUSTER_CHECK_WAIT}s")
+    time.sleep(MARIADB_CLUSTER_CHECK_WAIT)
diff --git a/mariadb/templates/configmap-bin.yaml b/mariadb/templates/configmap-bin.yaml
index 7b6e18ab2..991d83d8b 100644
--- a/mariadb/templates/configmap-bin.yaml
+++ b/mariadb/templates/configmap-bin.yaml
@@ -57,4 +57,6 @@ data:
   mariadb_controller.py: |
 {{ tuple "bin/_mariadb_controller.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
 {{- end }}
+  mariadb-wait-for-cluster.py: |
+{{ tuple "bin/_mariadb-wait-for-cluster.py.tpl" . | include "helm-toolkit.utils.template" | indent 4 }}
 {{- end }}
diff --git a/mariadb/templates/job-cluster-wait.yaml b/mariadb/templates/job-cluster-wait.yaml
new file mode 100644
index 000000000..4a239de3e
--- /dev/null
+++ b/mariadb/templates/job-cluster-wait.yaml
@@ -0,0 +1,123 @@
+{{/*
+Copyright 2019 Mirantis inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/}}
+
+{{- if .Values.manifests.job_cluster_wait }}
+{{- $envAll := . }}
+
+{{- $serviceAccountName := print .Release.Name "-cluster-wait" }}
+{{ tuple $envAll "cluster_wait" $serviceAccountName | include "helm-toolkit.snippets.kubernetes_pod_rbac_serviceaccount" }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: {{ $envAll.Release.Name }}-{{ $serviceAccountName }}-pod
+  namespace: {{ $envAll.Release.Namespace }}
+rules:
+  - apiGroups:
+      - ""
+    resources:
+      - configmaps
+    verbs:
+      - update
+      - patch
+      - get
+      - list
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: {{ $envAll.Release.Name }}-{{ $serviceAccountName }}-pod
+  namespace: {{ $envAll.Release.Namespace }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: {{ $envAll.Release.Name }}-{{ $serviceAccountName }}-pod
+subjects:
+  - kind: ServiceAccount
+    name: {{ $serviceAccountName }}
+    namespace: {{ $envAll.Release.Namespace }}
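+
+# The Role above is deliberately narrow: the job only needs get/list and
+# update/patch on configmaps so it can record the
+# initial-bootstrap-completed.cluster key in the cluster state configmap.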
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: "{{ .Release.Name }}-cluster-wait"
+  labels:
+{{ tuple $envAll "mariadb" "cluster-wait" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 4 }}
+  annotations:
+    {{ tuple $envAll | include "helm-toolkit.snippets.release_uuid" }}
+spec:
+  backoffLimit: {{ .Values.jobs.cluster_wait.clusterCheckRetries }}
+  template:
+    metadata:
+      labels:
+{{ tuple $envAll "mariadb" "cluster-wait" | include "helm-toolkit.snippets.kubernetes_metadata_labels" | indent 8 }}
+    spec:
+{{ dict "envAll" $envAll "application" "cluster_wait" | include "helm-toolkit.snippets.kubernetes_pod_security_context" | indent 6 }}
+      serviceAccountName: {{ $serviceAccountName }}
+      restartPolicy: OnFailure
+      nodeSelector:
+        {{ .Values.labels.job.node_selector_key }}: {{ .Values.labels.job.node_selector_value }}
+      initContainers:
+{{ tuple $envAll "cluster_wait" list | include "helm-toolkit.snippets.kubernetes_entrypoint_init_container" | indent 8 }}
+      containers:
+        - name: {{ .Release.Name }}-mariadb-cluster-wait
+{{ tuple $envAll "mariadb_scripted_test" | include "helm-toolkit.snippets.image" | indent 10 }}
+{{ dict "envAll" $envAll "application" "cluster_wait" "container" "mariadb_cluster_wait" | include "helm-toolkit.snippets.kubernetes_container_security_context" | indent 10 }}
+          env:
+            - name: MARIADB_HOST
+              value: {{ tuple "oslo_db" "internal" $envAll | include "helm-toolkit.endpoints.endpoint_host_lookup" }}
+            - name: MARIADB_REPLICAS
+              value: {{ .Values.pod.replicas.server | quote }}
+            - name: MARIADB_CLUSTER_CHECK_WAIT
+              value: {{ .Values.jobs.cluster_wait.clusterCheckWait | quote }}
+            - name: MARIADB_CLUSTER_STABILITY_COUNT
+              value: {{ .Values.jobs.cluster_wait.clusterStabilityCount | quote }}
+            - name: MARIADB_CLUSTER_STABILITY_WAIT
+              value: {{ .Values.jobs.cluster_wait.clusterStabilityWait | quote }}
+            - name: MARIADB_CLUSTER_STATE_CONFIGMAP
+              value: {{ printf "%s-%s" .Release.Name "mariadb-state" | quote }}
+            - name: MARIADB_CLUSTER_STATE_CONFIGMAP_NAMESPACE
+              value: {{ $envAll.Release.Namespace }}
+            - name: MARIADB_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: mariadb-dbadmin-password
+                  key: MYSQL_DBADMIN_PASSWORD
+          command:
+            - /tmp/mariadb-wait-for-cluster.py
+          volumeMounts:
+            - name: pod-tmp
+              mountPath: /tmp
+            - name: mariadb-bin
+              mountPath: /tmp/mariadb-wait-for-cluster.py
+              subPath: mariadb-wait-for-cluster.py
+              readOnly: true
+            - name: mariadb-secrets
+              mountPath: /etc/mysql/admin_user.cnf
+              subPath: admin_user.cnf
+              readOnly: true
+      volumes:
+        - name: pod-tmp
+          emptyDir: {}
+        - name: mariadb-bin
+          configMap:
+            name: mariadb-bin
+            defaultMode: 0555
+        - name: mariadb-secrets
+          secret:
+            secretName: mariadb-secrets
+            defaultMode: 0444
+{{- end }}
diff --git a/mariadb/values.yaml b/mariadb/values.yaml
index 68e4488d2..7051a1125 100644
--- a/mariadb/values.yaml
+++ b/mariadb/values.yaml
@@ -130,6 +130,16 @@ pod:
       controller:
         allowPrivilegeEscalation: false
         readOnlyRootFilesystem: true
+    cluster_wait:
+      pod:
+        runAsUser: 65534
+        runAsNonRoot: true
+      container:
+        mariadb_cluster_wait:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+              - ALL
   affinity:
     anti:
       type:
@@ -238,6 +248,10 @@ dependencies:
           service: oslo_db
     controller:
       services: null
+    cluster_wait:
+      services:
+        - endpoint: internal
+          service: oslo_db
 volume:
   # this value is used for single pod deployments of mariadb to prevent losing all data
   # if the pod is restarted
@@ -254,6 +268,11 @@ volume:
     size: 5Gi
 
 jobs:
+  cluster_wait:
+    clusterCheckWait: 30
+    clusterCheckRetries: 30
+    clusterStabilityCount: 30
+    clusterStabilityWait: 4
   exporter_create_sql_user:
     backoffLimit: 87600
     activeDeadlineSeconds: 3600
@@ -672,4 +691,5 @@ manifests:
   statefulset: true
   deployment_controller: true
   service_master: true
+  job_cluster_wait: false
 ...
diff --git a/releasenotes/notes/mariadb.yaml b/releasenotes/notes/mariadb.yaml
index 00f0dcc14..3ba0b73eb 100644
--- a/releasenotes/notes/mariadb.yaml
+++ b/releasenotes/notes/mariadb.yaml
@@ -65,4 +65,5 @@ mariadb:
     - 0.2.47 Deploy exporter as sidecar
     - 0.2.48 Switch to mariadb controller deployment
     - 0.2.49 Remove ingress deployment
+    - 0.2.50 Add cluster-wait job
 ...