basic HA test : disruptive action and health checks

Tested successfully via Infrared on both OSP13/16 (rhel7/8)

Change-Id: I27e23593f09d7ae8f0811b85d35c0df15dd56593
This commit is contained in:
pinikomarov 2020-01-22 02:03:22 +02:00 committed by Federico Ressi
parent 20e53b4762
commit 42b98cc517
6 changed files with 168 additions and 64 deletions

View File

View File

@ -0,0 +1,27 @@
from __future__ import absolute_import
import tobiko
from tobiko.shell import sh
from tobiko.openstack import topology
from oslo_log import log
LOG = log.getLogger(__name__)
def reset_all_controller_nodes_sequentially():
    """Reboot every controller node, then run ``hostname`` on each of them.

    Every node of the 'controller' topology group is sent ``sudo reboot``
    and its SSH client fixture is cleaned up. The controller list is then
    fetched again and ``hostname`` is executed on each node; the reported
    name is logged.
    """
    # reboot all controllers and wait for ssh Up on them
    nodes = topology.list_openstack_nodes(group='controller')
    for controller in nodes:
        # exit status is deliberately not checked here
        # NOTE(review): presumably because the session drops while the
        # node goes down -- confirm
        sh.execute("sudo reboot", ssh_client=controller.ssh_client,
                   expect_exit_status=None)
        # lazy %-style arguments, consistent with the other LOG calls
        LOG.info('rebooted %s', controller.name)
        # NOTE(review): presumably forces a fresh SSH connection on next
        # use of this client -- confirm
        tobiko.cleanup_fixture(controller.ssh_client)

    for controller in topology.list_openstack_nodes(group='controller'):
        result = sh.execute("hostname",
                            ssh_client=controller.ssh_client,
                            expect_exit_status=None)
        # strip surrounding whitespace (e.g. the command's trailing newline)
        # so the log message stays on one line
        controller_checked = (result.stdout or '').strip()
        LOG.info('%s is up', controller_checked)

View File

@ -0,0 +1,67 @@
from __future__ import absolute_import
import testtools
from tobiko.tests.faults.ha import cloud_disruptions
from tobiko.tripleo import pacemaker
from tobiko.tripleo import processes
def nodes_health_check():
    """Run the pacemaker and overcloud-process health checks, in that order.

    Raises:
        AssertionError: if a check returns a false value instead of raising
            an exception of its own.
    """
    # this method will be changed in future commit
    checks = (check_pacemaker_resources_health,
              check_overcloud_processes_health)
    for check in checks:
        # A check may report failure by returning a false value rather than
        # raising; do not let that pass silently.
        if not check():
            raise AssertionError(
                "health check failed: {}".format(check.__name__))
# TODO:
# Test existing created servers
# ServerStackResourcesTest().test_server_create()
# check cluster failed statuses
def check_pacemaker_resources_health():
    """Return ``all_healthy`` of a newly built PacemakerResourcesStatus."""
    status = pacemaker.PacemakerResourcesStatus()
    return status.all_healthy
def check_overcloud_processes_health():
    """Return ``basic_overcloud_processes_running`` of a newly built
    OvercloudProcessesStatus.
    """
    status = processes.OvercloudProcessesStatus()
    return status.basic_overcloud_processes_running
# TODO:
# class ServerStackResourcesTest(testtools.TestCase):
#
# """Tests connectivity via floating IPs"""
#
# #: Resources stack with floating IP and Nova server
# # TODO move down :
# # stack = tobiko.required_setup_fixture(stacks.CirrosServerStackFixture)
# # stack = tobiko.setup(my_instace) #tobiko.setup(my_instace)
#
# # TODO new instances of the class , give a uniq stack name
# # TODO : create a new CirrosServerStackFixture ?
# # CirrosServerStackNameFixture(stack_name='my-unique-id')
# # tobiko.setup(my_instace) -> tobiko.cleanup(my_instance)
# def test_create_vm(self):
# """Test SSH connectivity to floating IP address"""
# stack = tobiko.setup(my_instace) # tobiko.setup(my_instace)
# tobiko.cleanup(my_instance)
# # TODO : add check if old vm is there
# hostname = sh.get_hostname(ssh_client=self.stack.ssh_client)
# self.assertEqual(self.stack.server_name.lower(), hostname)
class RebootNodesTest(testtools.TestCase):
    """HA tests: run health check -> disruptive action -> health check.

    The disruptive action is a function that runs some disruptive scenario
    on an overcloud.
    """

    def test_reboot_controllers_recovery(self):
        # health check before the disruption
        nodes_health_check()
        # disruptive action: reboot the controller nodes
        cloud_disruptions.reset_all_controller_nodes_sequentially()
        # health check after the disruption
        nodes_health_check()
# [..]
# more tests to follow
# run health checks
# os faults stop rabbitmq service on one controller
# run health checks again

View File

@ -1,26 +0,0 @@
# Copyright (c) 2019 Red Hat
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from __future__ import absolute_import
import testtools
from tobiko.openstack import os_faults
class CloudManagementTest(testtools.TestCase):
    """Obtains the os-faults cloud management object and calls verify()."""

    def test_connect(self):
        # NOTE(review): the 'managenemt' spelling is kept as-is; the callee
        # is defined outside this file.
        manager = os_faults.get_os_fault_cloud_managenemt()
        manager.verify()

View File

@ -7,6 +7,7 @@ import six
import tobiko import tobiko
from tobiko.tripleo import overcloud from tobiko.tripleo import overcloud
from tobiko.shell import sh from tobiko.shell import sh
from tobiko.openstack import topology
LOG = log.getLogger(__name__) LOG = log.getLogger(__name__)
@ -16,7 +17,7 @@ class PcsResourceException(tobiko.TobikoException):
message = "pcs cluster is not in a healthy state" message = "pcs cluster is not in a healthy state"
def get_pcs_resources_table(hostname='controller-0'): def get_pcs_resources_table():
""" """
get pcs status from a controller and parse it get pcs status from a controller and parse it
to have it's resources states in check to have it's resources states in check
@ -51,13 +52,28 @@ def get_pcs_resources_table(hostname='controller-0'):
:return: dataframe of pcs resources stats table :return: dataframe of pcs resources stats table
""" """
ssh_client = overcloud.overcloud_ssh_client(hostname) # TODO make more robust(done, need other methods to be too)
output = sh.execute("sudo pcs status | grep ocf", # TODO make table.columns retry without exception
ssh_client=ssh_client).stdout
stream = six.StringIO(output) nodes = topology.list_openstack_nodes(group='controller')
table = pandas.read_csv(stream, delim_whitespace=True, header=None) controller_node = nodes[0].name
table.columns = ['resource', 'resource_type', 'resource_state', ssh_client = overcloud.overcloud_ssh_client(controller_node)
'overcloud_node']
# prevent pcs table read failure while pacemaker is starting
while True:
try:
output = sh.execute("sudo pcs status | grep ocf",
ssh_client=ssh_client,
expect_exit_status=None).stdout
stream = six.StringIO(output)
table = pandas.read_csv(stream, delim_whitespace=True, header=None)
table.columns = ['resource', 'resource_type', 'resource_state',
'overcloud_node']
except ValueError:
pass
else:
break
LOG.debug("Got pcs status :\n%s", table) LOG.debug("Got pcs status :\n%s", table)
return table return table
@ -152,23 +168,26 @@ class PacemakerResourcesStatus(object):
return False return False
def ovn_resource_healthy(self): def ovn_resource_healthy(self):
nodes_num = self.resource_count("(ocf::heartbeat:redis):") if self.container_runtime() == 'podman':
if nodes_num > 0: nodes_num = self.resource_count("(ocf::heartbeat:redis):")
return True if nodes_num > 0:
else:
master_num = self.resource_count_in_state(
"(ocf::heartbeat:redis):", "Master")
slave_num = self.resource_count_in_state(
"(ocf::heartbeat:redis):", "Slave")
if (master_num == 1) and (slave_num == nodes_num - master_num):
LOG.info(
"pcs status check: resource ovn is in healthy state")
return True return True
else: else:
LOG.info( master_num = self.resource_count_in_state(
"pcs status check: resource ovn is in not in " "(ocf::heartbeat:redis):", "Master")
"healthy state") slave_num = self.resource_count_in_state(
return False "(ocf::heartbeat:redis):", "Slave")
if (master_num == 1) and (slave_num == nodes_num - master_num):
LOG.info(
"pcs status check: resource ovn is in healthy state")
return True
else:
LOG.info(
"pcs status check: resource ovn is in not in "
"healthy state")
return False
else:
return True
@property @property
def all_healthy(self): def all_healthy(self):
@ -177,23 +196,35 @@ class PacemakerResourcesStatus(object):
and return a global healthy status and return a global healthy status
:return: Bool :return: Bool
""" """
if all([ for _ in range(360):
self.rabbitmq_resource_healthy(),
self.galera_resource_healthy(), try:
self.redis_resource_healthy(),
self.vips_resource_healthy(), if all([
self.ha_proxy_cinder_healthy(), self.rabbitmq_resource_healthy(),
self.ovn_resource_healthy() self.galera_resource_healthy(),
]): self.redis_resource_healthy(),
LOG.info("pcs status checks: all resources are in healthy state") self.vips_resource_healthy(),
return True self.ha_proxy_cinder_healthy(),
else: self.ovn_resource_healthy()
LOG.info("pcs status check: not all resources are in healthy " ]):
"state") LOG.info("pcs status checks: all resources are"
raise PcsResourceException() " in healthy state")
return True
else:
LOG.info("pcs status check: not all resources are "
"in healthy "
"state")
raise PcsResourceException()
except PcsResourceException:
# reread pcs status
self.pcs_df = get_pcs_resources_table()
# exhausted all retries
return False
def get_overcloud_nodes_running_pcs_resource(resource=None, resource_type=None, def get_overcloud_nodes_running_pcs_resource(resource=None,
resource_type=None,
resource_state=None): resource_state=None):
""" """
Check what nodes are running the specified resource/type/state Check what nodes are running the specified resource/type/state

View File

@ -111,6 +111,11 @@ class OvercloudProcessesStatus(object):
:return: Bool :return: Bool
""" """
for process_name in self.processes_to_check: for process_name in self.processes_to_check:
# osp16/python3 process is "neutron-server:"
if process_name == 'neutron-server' and \
self.oc_procs_df.query('PROCESS=="{}"'.format(
process_name)).empty:
process_name = 'neutron-server:'
if not self.oc_procs_df.query('PROCESS=="{}"'.format( if not self.oc_procs_df.query('PROCESS=="{}"'.format(
process_name)).empty: process_name)).empty:
LOG.info("overcloud processes status checks: process {} is " LOG.info("overcloud processes status checks: process {} is "