test_restart_frr not applied on all controllers

If FRR is restarted on all controllers simultaneously, quorum is lost from the pcs cluster's perspective (which is not supported), and recovery from that situation is not always possible. Change-Id: I7b8dd4deb348c5e33b50e07ee274437ef25f6c78
2024-09-19 13:52:34 +02:00 · 2024-09-19 13:52:34 +02:00 · a6e9824990
commit a6e9824990
parent 9f0d47e870
2 changed files with 29 additions and 2 deletions
--- a/tobiko/tests/faults/ha/cloud_disruptions.py
+++ b/tobiko/tests/faults/ha/cloud_disruptions.py
@ -584,6 +584,10 @@ def restart_service_on_all_nodes(service):
    is running and check the cloud is healthy after they are started again"""
    node_names = tripleo.get_overcloud_nodes_running_service(service)
    nodes = topology.list_openstack_nodes(hostnames=node_names)
    restart_service_on_nodes(service, nodes)
 def restart_service_on_nodes(service, nodes):
    for node in nodes:
        sh.stop_systemd_units(service, ssh_client=node.ssh_client)
    for node in nodes:
--- a/tobiko/tests/faults/ha/test_cloud_recovery.py
+++ b/tobiko/tests/faults/ha/test_cloud_recovery.py
@ -15,6 +15,7 @@
 #    under the License.
 from __future__ import absolute_import
 import random
 import typing
 import pytest
@ -29,6 +30,7 @@ from tobiko.openstack import nova as nova_osp
 from tobiko.openstack import octavia
 from tobiko.openstack import topology
 from tobiko.openstack import tests
 from tobiko.shell import sh
 from tobiko.tests.faults.ha import cloud_disruptions
 from tobiko.tripleo import pacemaker
 from tobiko.tripleo import processes
@ -359,8 +361,29 @@ class DisruptTripleoNodesTest(testtools.TestCase):
    @overcloud.skip_unless_ovn_bgp_agent
    def test_restart_frr(self):
        OvercloudHealthCheck.run_before()
-        cloud_disruptions.restart_service_on_all_nodes(
+
-            topology.get_agent_service_name(neutron.FRR))
+        frr_service = topology.get_agent_service_name(neutron.FRR)
        # restart frr on all computes
        computes = topology.list_openstack_nodes(group='compute')
        cloud_disruptions.restart_service_on_nodes(frr_service, computes)
        # restart frr on all networkers
        if 'networker' in topology.list_openstack_node_groups():
            networkers = topology.list_openstack_nodes(group='networker')
            cloud_disruptions.restart_service_on_nodes(frr_service, networkers)
        # restart frr on one controller (in order to avoid quorum issues)
        controller = random.choice(topology.list_openstack_nodes(
            group='controller'))
        sh.stop_systemd_units(frr_service, ssh_client=controller.ssh_client)
        start_time = tobiko.time()
        if not pacemaker.fencing_deployed():
            # when not fencing, the test just starts frr on the controller
            sh.start_systemd_units(
                frr_service, ssh_client=controller.ssh_client)
        else:
            # when fencing, the controller is rebooted automatically
            cloud_disruptions.check_overcloud_node_uptime(
                controller.ssh_client, start_time)
        OvercloudHealthCheck.run_after()
    def test_restart_neutron(self):