test_restart_frr not applied on all controllers

If FRR is restarted on all controllers simultaneously, quorum is lost
from the pcs cluster's perspective (a situation that is not supported),
and recovering from it may not always work.

Change-Id: I7b8dd4deb348c5e33b50e07ee274437ef25f6c78
This commit is contained in:
Eduardo Olivares 2024-09-19 13:52:34 +02:00
parent 9f0d47e870
commit a6e9824990
2 changed files with 29 additions and 2 deletions

View File

@ -584,6 +584,10 @@ def restart_service_on_all_nodes(service):
is running and check the cloud is healthy after they are started again"""
node_names = tripleo.get_overcloud_nodes_running_service(service)
nodes = topology.list_openstack_nodes(hostnames=node_names)
restart_service_on_nodes(service, nodes)
def restart_service_on_nodes(service, nodes):
for node in nodes:
sh.stop_systemd_units(service, ssh_client=node.ssh_client)
for node in nodes:

View File

@ -15,6 +15,7 @@
# under the License.
from __future__ import absolute_import
import random
import typing
import pytest
@ -29,6 +30,7 @@ from tobiko.openstack import nova as nova_osp
from tobiko.openstack import octavia
from tobiko.openstack import topology
from tobiko.openstack import tests
from tobiko.shell import sh
from tobiko.tests.faults.ha import cloud_disruptions
from tobiko.tripleo import pacemaker
from tobiko.tripleo import processes
@ -359,8 +361,29 @@ class DisruptTripleoNodesTest(testtools.TestCase):
@overcloud.skip_unless_ovn_bgp_agent
def test_restart_frr(self):
    """Restart FRR on the overcloud without touching every controller.

    FRR is restarted on all compute nodes and, when a 'networker' node
    group exists, on all networker nodes.  On the controllers the
    service is stopped on a single, randomly chosen node only, in order
    to avoid quorum issues.

    The overcloud health check is run before and after the disruption.
    """
    OvercloudHealthCheck.run_before()
    frr_service = topology.get_agent_service_name(neutron.FRR)

    # NOTE: FRR must NOT be restarted through
    # cloud_disruptions.restart_service_on_all_nodes() here: that would
    # also hit all the controllers at once.

    # restart frr on all computes
    computes = topology.list_openstack_nodes(group='compute')
    cloud_disruptions.restart_service_on_nodes(frr_service, computes)

    # restart frr on all networkers (this group is optional)
    if 'networker' in topology.list_openstack_node_groups():
        networkers = topology.list_openstack_nodes(group='networker')
        cloud_disruptions.restart_service_on_nodes(frr_service, networkers)

    # restart frr on one controller (in order to avoid quorum issues)
    controller = random.choice(
        topology.list_openstack_nodes(group='controller'))
    sh.stop_systemd_units(frr_service, ssh_client=controller.ssh_client)
    # reference time handed to the uptime check in the fencing case
    start_time = tobiko.time()

    if not pacemaker.fencing_deployed():
        # when not fencing, the test just starts frr on the controller
        sh.start_systemd_units(
            frr_service, ssh_client=controller.ssh_client)
    else:
        # when fencing, the controller is rebooted automatically
        cloud_disruptions.check_overcloud_node_uptime(
            controller.ssh_client, start_time)

    OvercloudHealthCheck.run_after()
def test_restart_neutron(self):