From b0bb8dfa7a2196d484439b5f8aeb5faa029d45c8 Mon Sep 17 00:00:00 2001
From: Oleksii Grudev
Date: Thu, 23 Jan 2020 18:45:18 +0200
Subject: [PATCH] Prevent splitbrain during full Galera restart
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces a new cluster state, "reboot", which is set by
the leader node so that the other nodes start mysql without the
"--wsrep-new-cluster" option. Before this change the following
situation could occur:

1. All pods go down one by one with some offset.
2. The first and second nodes have the highest seqno.
3. The script on the first node detects there are no active backends
   and starts its timeout loop.
4. The script on the second node detects there are no active backends
   and starts its timeout loop (roughly 20 seconds after the first
   node).
5. The timeout loop finishes on the first node; it checks for the
   highest seqno and lowest hostname and wins the right to start the
   cluster. mysql is started with the "--wsrep-new-cluster" parameter,
   and this node's seqno is set to "-1" after mysql starts.
6. The periodic job syncs the values from the grastate file to the
   configmap.
7. The timeout loop finishes on the second node. It looks for the node
   with the highest seqno and lowest hostname, and since the first
   node's seqno is already "-1", the second node decides that it should
   lead the cluster startup and also runs mysql with the
   "--wsrep-new-cluster" option, which leads to a split brain.

Change-Id: Ic63fd916289cb05411544cb33d5fdeed1352b380
---
 mariadb/templates/bin/_start.py.tpl | 37 ++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/mariadb/templates/bin/_start.py.tpl b/mariadb/templates/bin/_start.py.tpl
index b20d55786c..312ad84efb 100644
--- a/mariadb/templates/bin/_start.py.tpl
+++ b/mariadb/templates/bin/_start.py.tpl
@@ -436,7 +436,8 @@ def get_cluster_state():
                     "openstackhelm.openstack.org/cluster.state": state,
                     "openstackhelm.openstack.org/leader.node": leader,
                     "openstackhelm.openstack.org/leader.expiry":
-                        leader_expiry
+                        leader_expiry,
+                    "openstackhelm.openstack.org/reboot.node": ""
                 }
             },
             "data": {}
@@ -685,9 +686,17 @@ def check_if_i_lead():
                 "{1}".format(counter, count))
     max_seqno_nodes = get_nodes_with_highest_seqno()
     leader_node = resolve_leader_node(max_seqno_nodes)
-    if local_hostname == leader_node:
-        logger.info("I lead the cluster")
+    if (local_hostname == leader_node and not check_for_active_nodes()
+            and get_cluster_state() == 'live'):
+        logger.info("I lead the cluster. Setting cluster state to reboot.")
+        set_configmap_annotation(
+            key='openstackhelm.openstack.org/cluster.state', value='reboot')
+        set_configmap_annotation(
+            key='openstackhelm.openstack.org/reboot.node', value=local_hostname)
         return True
+    elif local_hostname == leader_node:
+        logger.info("The cluster is already rebooting")
+        return False
     else:
         logger.info("{0} leads the cluster".format(leader_node))
         return False
@@ -866,6 +875,28 @@ elif get_cluster_state() == 'live':
         while not check_for_active_nodes():
             time.sleep(default_sleep)
         run_mysqld()
+elif get_cluster_state() == 'reboot':
+    reboot_node = get_configmap_value(
+        type='annotation', key='openstackhelm.openstack.org/reboot.node')
+    if reboot_node == local_hostname:
+        logger.info(
+            "Cluster reboot procedure wasn't finished. Trying again.")
+        update_grastate_on_restart()
+        launch_leader_election()
+        launch_cluster_monitor()
+        mysqld_reboot()
+    else:
+        logger.info(
+            "Waiting for the lead node to come online before joining "
+            "it")
+        update_grastate_on_restart()
+        launch_leader_election()
+        launch_cluster_monitor()
+        while not check_for_active_nodes():
+            time.sleep(default_sleep)
+        set_configmap_annotation(
+            key='openstackhelm.openstack.org/cluster.state', value='live')
+        run_mysqld()
 else:
     logger.critical("Dont understand cluster state, exiting with error status")
     sys.exit(1)