test_restart_frr: restart FRR on only one controller instead of all

If FRR is restarted on all controllers simultaneously, the pcs cluster loses quorum (a situation that is not supported), and recovering from it may not always work. The test therefore restarts FRR on all computes and networkers, but on only one randomly chosen controller. Change-Id: I7b8dd4deb348c5e33b50e07ee274437ef25f6c78
2024-09-19 13:52:34 +02:00 · 2024-09-19 13:52:34 +02:00 · a6e9824990
commit a6e9824990
parent 9f0d47e870
2 changed files with 29 additions and 2 deletions
--- a/tobiko/tests/faults/ha/cloud_disruptions.py
+++ b/tobiko/tests/faults/ha/cloud_disruptions.py
@ -584,6 +584,10 @@ def restart_service_on_all_nodes(service):
    is running and check the cloud is healthy after they are started again"""
    node_names = tripleo.get_overcloud_nodes_running_service(service)
    nodes = topology.list_openstack_nodes(hostnames=node_names)
+    restart_service_on_nodes(service, nodes)
+
+
+def restart_service_on_nodes(service, nodes):
    for node in nodes:
        sh.stop_systemd_units(service, ssh_client=node.ssh_client)
    for node in nodes:
--- a/tobiko/tests/faults/ha/test_cloud_recovery.py
+++ b/tobiko/tests/faults/ha/test_cloud_recovery.py
@ -15,6 +15,7 @@
 #    under the License.
 from __future__ import absolute_import

+import random
 import typing

 import pytest
@ -29,6 +30,7 @@ from tobiko.openstack import nova as nova_osp
 from tobiko.openstack import octavia
 from tobiko.openstack import topology
 from tobiko.openstack import tests
+from tobiko.shell import sh
 from tobiko.tests.faults.ha import cloud_disruptions
 from tobiko.tripleo import pacemaker
 from tobiko.tripleo import processes
@ -359,8 +361,29 @@ class DisruptTripleoNodesTest(testtools.TestCase):
    @overcloud.skip_unless_ovn_bgp_agent
    def test_restart_frr(self):
        OvercloudHealthCheck.run_before()
-        cloud_disruptions.restart_service_on_all_nodes(
-            topology.get_agent_service_name(neutron.FRR))
+
+        frr_service = topology.get_agent_service_name(neutron.FRR)
+        # restart frr on all computes
+        computes = topology.list_openstack_nodes(group='compute')
+        cloud_disruptions.restart_service_on_nodes(frr_service, computes)
+        # restart frr on all networkers
+        if 'networker' in topology.list_openstack_node_groups():
+            networkers = topology.list_openstack_nodes(group='networker')
+            cloud_disruptions.restart_service_on_nodes(frr_service, networkers)
+        # restart frr on one controller (in order to avoid quorum issues)
+        controller = random.choice(topology.list_openstack_nodes(
+            group='controller'))
+        sh.stop_systemd_units(frr_service, ssh_client=controller.ssh_client)
+        start_time = tobiko.time()
+        if not pacemaker.fencing_deployed():
+            # when not fencing, the test just starts frr on the controller
+            sh.start_systemd_units(
+                frr_service, ssh_client=controller.ssh_client)
+        else:
+            # when fencing, the controller is rebooted automatically
+            cloud_disruptions.check_overcloud_node_uptime(
+                controller.ssh_client, start_time)
+
        OvercloudHealthCheck.run_after()

    def test_restart_neutron(self):