Basic HA test: disruptive action and health checks

Tested successfully via Infrared on both OSP13 and OSP16 (RHEL 7/8).

Change-Id: I27e23593f09d7ae8f0811b85d35c0df15dd56593
This commit is contained in:
parent 20e53b4762
commit 42b98cc517
tobiko/tests/faults/ha/__init__.py (new file, 0 lines)
tobiko/tests/faults/ha/cloud_disruptions.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from __future__ import absolute_import

import tobiko
from tobiko.shell import sh
from tobiko.openstack import topology
from oslo_log import log


LOG = log.getLogger(__name__)


def reset_all_controller_nodes_sequentially():

    # reboot all controllers and wait for ssh Up on them
    nodes = topology.list_openstack_nodes(group='controller')
    for controller in nodes:
        sh.execute("sudo reboot", ssh_client=controller.ssh_client,
                   expect_exit_status=None)
        LOG.info('rebooted {}'.format(controller.name))
        tobiko.cleanup_fixture(controller.ssh_client)

    for controller in topology.list_openstack_nodes(group='controller'):
        controller_checked = sh.execute("hostname",
                                        ssh_client=controller.ssh_client,
                                        expect_exit_status=None).stdout
        LOG.info('{} is up '.format(controller_checked))
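The helper above reconnects after the reboot and simply re-runs "hostname" to confirm each controller is back. A minimal, self-contained sketch (not part of this commit; the function name, host argument and timing values are illustrative assumptions) of how the "wait for ssh Up" step could be made explicit by polling the SSH port:

import socket
import time


def wait_for_ssh_port(host, port=22, timeout=300, interval=5):
    # Poll until host:port accepts TCP connections, or give up after timeout.
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            sock = socket.create_connection((host, port), timeout=interval)
            sock.close()
            return
        except (socket.error, socket.timeout):
            time.sleep(interval)
    raise RuntimeError(
        "SSH port on {} did not come back within {}s".format(host, timeout))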
tobiko/tests/faults/ha/test_cloud_recovery.py (new file, 67 lines)
@@ -0,0 +1,67 @@
from __future__ import absolute_import

import testtools

from tobiko.tests.faults.ha import cloud_disruptions
from tobiko.tripleo import pacemaker
from tobiko.tripleo import processes


def nodes_health_check():
    # this method will be changed in a future commit
    check_pacemaker_resources_health()
    check_overcloud_processes_health()
    # TODO:
    # Test existing created servers
    # ServerStackResourcesTest().test_server_create()


# check cluster failed statuses
def check_pacemaker_resources_health():
    return pacemaker.PacemakerResourcesStatus().all_healthy


def check_overcloud_processes_health():
    return processes.OvercloudProcessesStatus(
        ).basic_overcloud_processes_running

# TODO:
# class ServerStackResourcesTest(testtools.TestCase):
#
#     """Tests connectivity via floating IPs"""
#
#     #: Resources stack with floating IP and Nova server
#     # TODO move down:
#     # stack = tobiko.required_setup_fixture(stacks.CirrosServerStackFixture)
#     # stack = tobiko.setup(my_instace)  # tobiko.setup(my_instace)
#
#     # TODO: new instances of the class, give a unique stack name
#     # TODO: create a new CirrosServerStackFixture?
#     # CirrosServerStackNameFixture(stack_name='my-unique-id')
#     # tobiko.setup(my_instace) -> tobiko.cleanup(my_instance)
#     def test_create_vm(self):
#         """Test SSH connectivity to floating IP address"""
#         stack = tobiko.setup(my_instace)  # tobiko.setup(my_instace)
#         tobiko.cleanup(my_instance)
#         # TODO: add check if old vm is there
#         hostname = sh.get_hostname(ssh_client=self.stack.ssh_client)
#         self.assertEqual(self.stack.server_name.lower(), hostname)


class RebootNodesTest(testtools.TestCase):

    """HA tests: run health check -> disruptive action -> health check
    disruptive_action: a function that runs some disruptive
    scenario on an overcloud"""

    def test_reboot_controllers_recovery(self):
        nodes_health_check()
        cloud_disruptions.reset_all_controller_nodes_sequentially()
        nodes_health_check()


# [..]
# more tests to follow:
# run health checks
# os-faults stops the rabbitmq service on one controller
# run health checks again
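The class docstring above describes the pattern every test in this module is meant to follow. A minimal sketch (not part of the commit; the helper name is illustrative and it reuses nodes_health_check from the module above) of the same "health check -> disruptive action -> health check" flow expressed as a reusable function:

def run_disruption_scenario(disruptive_action, health_check=nodes_health_check):
    # health check -> disruptive action -> health check
    health_check()
    disruptive_action()
    health_check()


# Example usage with the disruption added in this commit:
# run_disruption_scenario(
#     cloud_disruptions.reset_all_controller_nodes_sequentially)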
(deleted file)
@@ -1,26 +0,0 @@
# Copyright (c) 2019 Red Hat
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from __future__ import absolute_import


import testtools
from tobiko.openstack import os_faults


class CloudManagementTest(testtools.TestCase):

    def test_connect(self):
        cloud_management = os_faults.get_os_fault_cloud_managenemt()
        cloud_management.verify()
@@ -7,6 +7,7 @@ import six
 import tobiko
 from tobiko.tripleo import overcloud
 from tobiko.shell import sh
+from tobiko.openstack import topology


 LOG = log.getLogger(__name__)
@@ -16,7 +17,7 @@ class PcsResourceException(tobiko.TobikoException):
     message = "pcs cluster is not in a healthy state"


-def get_pcs_resources_table(hostname='controller-0'):
+def get_pcs_resources_table():
     """
     get pcs status from a controller and parse it
     to have it's resources states in check
@@ -51,13 +52,28 @@ def get_pcs_resources_table(hostname='controller-0'):

     :return: dataframe of pcs resources stats table
     """
-    ssh_client = overcloud.overcloud_ssh_client(hostname)
-    output = sh.execute("sudo pcs status | grep ocf",
-                        ssh_client=ssh_client).stdout
-    stream = six.StringIO(output)
-    table = pandas.read_csv(stream, delim_whitespace=True, header=None)
-    table.columns = ['resource', 'resource_type', 'resource_state',
-                     'overcloud_node']
+    # TODO make more robust(done, need other methods to be too)
+    # TODO make table.columns retry without exception
+
+    nodes = topology.list_openstack_nodes(group='controller')
+    controller_node = nodes[0].name
+    ssh_client = overcloud.overcloud_ssh_client(controller_node)
+
+    # prevent pcs table read failure while pacemaker is starting
+    while True:
+        try:
+            output = sh.execute("sudo pcs status | grep ocf",
+                                ssh_client=ssh_client,
+                                expect_exit_status=None).stdout
+            stream = six.StringIO(output)
+            table = pandas.read_csv(stream, delim_whitespace=True, header=None)
+
+            table.columns = ['resource', 'resource_type', 'resource_state',
+                             'overcloud_node']
+        except ValueError:
+            pass
+        else:
+            break
     LOG.debug("Got pcs status :\n%s", table)
     return table
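For reference, a small illustrative example (not part of the commit; the sample pcs output line is made up) of how one "pcs status | grep ocf" line maps onto the four DataFrame columns assigned above:

import pandas
import six

# A made-up pcs status line: resource, resource type, state, node.
sample = "rabbitmq (ocf::heartbeat:rabbitmq-cluster): Started controller-0\n"
table = pandas.read_csv(six.StringIO(sample), delim_whitespace=True, header=None)
table.columns = ['resource', 'resource_type', 'resource_state', 'overcloud_node']
print(table.loc[0, 'resource_state'])  # -> Started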
@@ -152,23 +168,26 @@ class PacemakerResourcesStatus(object):
         return False

     def ovn_resource_healthy(self):
-        nodes_num = self.resource_count("(ocf::heartbeat:redis):")
-        if nodes_num > 0:
-            return True
-        else:
-            master_num = self.resource_count_in_state(
-                "(ocf::heartbeat:redis):", "Master")
-            slave_num = self.resource_count_in_state(
-                "(ocf::heartbeat:redis):", "Slave")
-            if (master_num == 1) and (slave_num == nodes_num - master_num):
-                LOG.info(
-                    "pcs status check: resource ovn is in healthy state")
+        if self.container_runtime() == 'podman':
+            nodes_num = self.resource_count("(ocf::heartbeat:redis):")
+            if nodes_num > 0:
                 return True
             else:
-                LOG.info(
-                    "pcs status check: resource ovn is in not in "
-                    "healthy state")
-                return False
+                master_num = self.resource_count_in_state(
+                    "(ocf::heartbeat:redis):", "Master")
+                slave_num = self.resource_count_in_state(
+                    "(ocf::heartbeat:redis):", "Slave")
+                if (master_num == 1) and (slave_num == nodes_num - master_num):
+                    LOG.info(
+                        "pcs status check: resource ovn is in healthy state")
+                    return True
+                else:
+                    LOG.info(
+                        "pcs status check: resource ovn is in not in "
+                        "healthy state")
+                    return False
+        else:
+            return True

     @property
     def all_healthy(self):
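A worked example (illustrative numbers, not from the commit) of the Master/Slave rule applied in ovn_resource_healthy above: with three instances of the resource reported by pcs, exactly one must be Master and the remaining ones Slaves.

nodes_num = 3    # resource instances reported by pcs
master_num = 1   # instances in "Master" state
slave_num = 2    # instances in "Slave" state

assert (master_num == 1) and (slave_num == nodes_num - master_num)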
@@ -177,23 +196,35 @@ class PacemakerResourcesStatus(object):
         and return a global healthy status
         :return: Bool
         """
-        if all([
-            self.rabbitmq_resource_healthy(),
-            self.galera_resource_healthy(),
-            self.redis_resource_healthy(),
-            self.vips_resource_healthy(),
-            self.ha_proxy_cinder_healthy(),
-            self.ovn_resource_healthy()
-        ]):
-            LOG.info("pcs status checks: all resources are in healthy state")
-            return True
-        else:
-            LOG.info("pcs status check: not all resources are in healthy "
-                     "state")
-            raise PcsResourceException()
+        for _ in range(360):
+
+            try:
+
+                if all([
+                    self.rabbitmq_resource_healthy(),
+                    self.galera_resource_healthy(),
+                    self.redis_resource_healthy(),
+                    self.vips_resource_healthy(),
+                    self.ha_proxy_cinder_healthy(),
+                    self.ovn_resource_healthy()
+                ]):
+                    LOG.info("pcs status checks: all resources are"
+                             " in healthy state")
+                    return True
+                else:
+                    LOG.info("pcs status check: not all resources are "
+                             "in healthy "
+                             "state")
+                    raise PcsResourceException()
+            except PcsResourceException:
+                # reread pcs status
+                self.pcs_df = get_pcs_resources_table()
+        # exhausted all retries
+        return False


-def get_overcloud_nodes_running_pcs_resource(resource=None, resource_type=None,
+def get_overcloud_nodes_running_pcs_resource(resource=None,
+                                             resource_type=None,
                                              resource_state=None):
     """
     Check what nodes are running the specified resource/type/state
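The all_healthy change above retries the combined check up to 360 times, re-reading pcs status whenever PcsResourceException is raised. A minimal sketch (an assumption, not part of the commit; names and timings are illustrative) of the same retry idea with an explicit sleep between attempts so the loop does not re-read as fast as it can:

import time


def wait_until_healthy(check, reread_state, retries=360, interval=5):
    # Call check() until it returns True; re-read state and sleep between tries.
    for _ in range(retries):
        if check():
            return True
        reread_state()
        time.sleep(interval)
    # exhausted all retries
    return False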
@@ -111,6 +111,11 @@ class OvercloudProcessesStatus(object):
         :return: Bool
         """
         for process_name in self.processes_to_check:
+            # osp16/python3 process is "neutron-server:"
+            if process_name == 'neutron-server' and \
+                    self.oc_procs_df.query('PROCESS=="{}"'.format(
+                        process_name)).empty:
+                process_name = 'neutron-server:'
             if not self.oc_procs_df.query('PROCESS=="{}"'.format(
                     process_name)).empty:
                 LOG.info("overcloud processes status checks: process {} is "
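An illustrative, self-contained example (not part of the commit; the DataFrame content is made up) of the process-name fallback added above: on OSP16/python3 the neutron server is reported as "neutron-server:", so the plain name is swapped for the suffixed one when it is not found in the process table.

import pandas

# Made-up overcloud process table.
oc_procs_df = pandas.DataFrame({'PROCESS': ['neutron-server:', 'nova-compute']})

process_name = 'neutron-server'
if oc_procs_df.query('PROCESS=="{}"'.format(process_name)).empty:
    process_name = 'neutron-server:'

assert not oc_procs_df.query('PROCESS=="{}"'.format(process_name)).empty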