pidone: ban resources (redis and ovn-dbs) and check that another resource is promoted

Change-Id: Ic62ec49a03c9bd0314b953cc8f780740c81865b0
This commit is contained in:
dabarzil 2021-04-21 15:55:08 +03:00 committed by Federico Ressi
parent a560d4a4e7
commit 6deadc5d4b
3 changed files with 87 additions and 1 deletions

View File

@ -37,6 +37,8 @@ network_disruption = """
# Shell command that restores the iptables rules from the saved rules file.
undisrupt_network = """
sudo iptables-restore /home/heat-admin/working.iptables.rules
"""
# pcs command templates: the two placeholders are filled by the callers in
# this module with (resource name, node name), in that order.
ban_resource = "sudo pcs resource ban {} {}"
clear_resource = "sudo pcs resource clear {} {}"
# Restarts the ovn-dbs-bundle pacemaker resource.
ovn_db_pcs_resource_restart = "sudo pcs resource restart ovn-dbs-bundle"
# Send SIGKILL to the process id(s) that pgrep reports for the given name.
kill_rabbit = "sudo kill -9 $(pgrep beam.smp)"
kill_galera = "sudo kill -9 $(pgrep mysqld)"
@ -59,6 +61,10 @@ class PcsEnableException(tobiko.TobikoException):
message = "pcs enable didn't start the resource"
# Raised by ban_master_resource when the pacemaker status check does not
# report the resource as banned.
class PcsBanException(tobiko.TobikoException):
message = "the resource wasn't banned"
# Error type whose message states that a Galera bootstrap must not be done
# from a node that has no grastate.dat file.
# NOTE(review): the raising site is not visible in this view - confirm usage.
class GaleraBoostrapException(tobiko.TobikoException):
message = "Bootstrap should not be done from node without grastate.dat"
@ -337,6 +343,54 @@ def reset_ovndb_master_container():
container_host=node)
def ban_master_resource(resource_type, resource_name):
    """Ban the master of a pacemaker resource and check another is promoted.

    For every resource instance except the last one, the node currently
    reported as master is banned with ``pcs resource ban``; pacemaker status
    is then polled until it reports the ban together with exactly one
    master. All bans are cleared before the function returns or fails.

    :param resource_type: resource type as matched against the pcs status
        table, e.g. "(ocf::heartbeat:redis):"
    :param resource_name: pcs resource name passed to ``pcs resource ban``
        and ``pcs resource clear``, e.g. "redis-bundle"
    :raises: whatever ``tobiko.fail`` raises when no ban/promotion cycle
        completed down to the last remaining instance
    """
    max_attempts = 60
    nodes = topology.list_openstack_nodes(group='controller')
    resource_num = pacemaker.PacemakerResourcesStatus().resource_count(
        resource_type)
    # repeat process for all nodes except one
    for i in range(resource_num - 1):
        master_nodes = pacemaker.get_resource_master_node(resource_type)
        if not master_nodes:
            # no master to ban: fall through to cleanup and failure
            break
        master_node_name = master_nodes[0]
        sh.execute(ban_resource.format(resource_name, master_node_name),
                   ssh_client=topology.get_openstack_node(
                       master_node_name).ssh_client)
        # A plain conditional replaces the former raise/catch of
        # PcsBanException: `except PcsBanException():` named an instance,
        # which makes Python raise TypeError instead of retrying.
        for attempt_number in range(1, max_attempts + 1):
            # check if resource banned and another slave promoted
            if pacemaker.PacemakerResourcesStatus().resource_banned(
                    resource_type):
                # only one resource left (the master): test succeeded
                if i == resource_num - 2:
                    clear_resources(nodes, resource_name)
                    time.sleep(10)
                    return
                # more than 2 resources, so repeat process
                time.sleep(20)
                break
            LOG.info('Retrying pacemaker resource checks attempt '
                     '%d of %d', attempt_number, max_attempts)
            time.sleep(1)
        # NOTE(review): when all attempts are exhausted the outer loop still
        # moves on to ban the next master (original behaviour) - confirm
        # whether it should fail immediately instead.
    clear_resources(nodes, resource_name)
    tobiko.fail('The resource {} was not promoted to master'.format(
        resource_name))
def clear_resources(nodes, resource_name):
    """Run ``pcs resource clear`` for the resource once per entry in nodes.

    The target node names are generated as ``controller-<index>`` and every
    command is executed through the SSH client of the first node.

    :param nodes: sequence of nodes; only its length and the first node's
        ``ssh_client`` are used
    :param resource_name: pcs resource name whose bans are cleared
    """
    # NOTE(review): node names are synthesized from the index rather than
    # read from the node objects - confirm they match the real host names.
    for index, _node in enumerate(nodes):
        controller_name = 'controller-{}'.format(index)
        command = clear_resource.format(resource_name, controller_name)
        sh.execute(command, ssh_client=nodes[0].ssh_client)
def kill_rabbitmq_service():
"""kill a rabbit process on a random controller,
check in pacemaker it is down"""

View File

@ -245,6 +245,17 @@ class DisruptTripleoNodesTest(testtools.TestCase):
LOG.info("Verify can create VMs after controllers power on...")
tests.test_server_creation()
def test_ban_redis(self):
    # Ban the redis master resource, with overcloud health checks run
    # before and after the disruption.
    resource_type = "(ocf::heartbeat:redis):"
    resource_name = "redis-bundle"
    OvercloudHealthCheck.run_before()
    cloud_disruptions.ban_master_resource(resource_type, resource_name)
    OvercloudHealthCheck.run_after()
def test_ban_ovs(self):
    # Ban the ovn-dbs master resource (the resource is OVN DB servers,
    # despite the "ovs" in the test name), with overcloud health checks
    # run before and after the disruption.
    resource_type = "(ocf::ovn:ovndb-servers):"
    resource_name = "ovn-dbs-bundle"
    OvercloudHealthCheck.run_before()
    cloud_disruptions.ban_master_resource(resource_type, resource_name)
    OvercloudHealthCheck.run_after()
# [..]
# more tests to follow

View File

@ -224,6 +224,27 @@ class PacemakerResourcesStatus(object):
# exhausted all retries
tobiko.fail('pcs cluster is not in a healthy state')
def resource_banned(self, resource_type):
    """Tell whether a resource was banned and a master is still present.

    Refreshes ``self.pcs_df`` from the pcs resources table, then counts the
    instances of ``resource_type`` in the "Master", "Slave" and "Stopped"
    states. Stopped instances are the ones counted as banned.

    :param resource_type: resource type to look up in the pcs table
    :returns: True when exactly one instance is Master, at least one is
        Stopped and every remaining instance is Slave; False otherwise
    """
    self.pcs_df = get_pcs_resources_table()
    nodes_num = self.resource_count(resource_type)
    master_num = self.resource_count_in_state(resource_type, "Master")
    slave_num = self.resource_count_in_state(resource_type, "Slave")
    banned_num = self.resource_count_in_state(resource_type, "Stopped")

    all_accounted_for = (
        slave_num == nodes_num - master_num - banned_num)
    if master_num == 1 and banned_num >= 1 and all_accounted_for:
        # single-line message: the former triple-quoted literal leaked a
        # newline and source indentation into the log output
        LOG.info("pcs status check: resource has been banned successfully "
                 "and another one has been promoted")
        return True
    if banned_num == 0:
        LOG.info("pcs status check: resource has not been banned")
        return False
    LOG.info("pcs status check: resource is not in a healthy state")
    return False
def get_overcloud_nodes_running_pcs_resource(resource=None,
resource_type=None,
@ -253,7 +274,7 @@ def get_overcloud_nodes_running_pcs_resource(resource=None,
def get_resource_master_node(resource_type=None):
    """Look up the node(s) running the resource in the 'Master' state.

    :param resource_type: resource type forwarded to
        get_overcloud_nodes_running_pcs_resource
    :returns: the lookup result, unchanged; ban_master_resource indexes it
        and uses the first element as a node name
    """
    # The result must be returned: the earlier form called the lookup and
    # dropped its value, so callers always received None.
    return get_overcloud_nodes_running_pcs_resource(
        resource_type=resource_type, resource_state='Master')