basic HA test : disruptive action and health checks
Tested successfully via Infrared on both OSP13/16 (rhel7/8) Change-Id: I27e23593f09d7ae8f0811b85d35c0df15dd56593
This commit is contained in:
parent
20e53b4762
commit
42b98cc517
0
tobiko/tests/faults/ha/__init__.py
Normal file
0
tobiko/tests/faults/ha/__init__.py
Normal file
27
tobiko/tests/faults/ha/cloud_disruptions.py
Normal file
27
tobiko/tests/faults/ha/cloud_disruptions.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
|
|
||||||
|
import tobiko
|
||||||
|
from tobiko.shell import sh
|
||||||
|
from tobiko.openstack import topology
|
||||||
|
from oslo_log import log
|
||||||
|
|
||||||
|
|
||||||
|
LOG = log.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def reset_all_controller_nodes_sequentially():
    """Reboot every controller node, then run ``hostname`` on each of them.

    First pass: for each node returned by
    ``topology.list_openstack_nodes(group='controller')`` run
    ``sudo reboot`` through the node's SSH client and clean up that SSH
    client fixture.

    Second pass: list the controller nodes again, run ``hostname`` on each
    one and log the answer.

    Neither command has its exit status checked
    (``expect_exit_status=None``).
    """
    # reboot all controllers and wait for ssh Up on them
    nodes = topology.list_openstack_nodes(group='controller')
    for controller in nodes:
        # NOTE(review): the exit status is presumably ignored because the
        # connection can drop while the node goes down -- confirm.
        sh.execute("sudo reboot", ssh_client=controller.ssh_client,
                   expect_exit_status=None)
        # Pass the argument to the logger instead of pre-formatting, so the
        # message is only rendered when the record is actually emitted.
        LOG.info('rebooted %s', controller.name)
        # NOTE(review): presumably drops the now stale SSH connection so the
        # next command has to reconnect -- confirm against tobiko fixtures.
        tobiko.cleanup_fixture(controller.ssh_client)

    # The node list is fetched again rather than reusing ``nodes``.
    for controller in topology.list_openstack_nodes(group='controller'):
        result = sh.execute("hostname",
                            ssh_client=controller.ssh_client,
                            expect_exit_status=None)
        # Strip surrounding whitespace (the command output presumably ends
        # with a newline) so the log message stays on a single line.
        controller_checked = (result.stdout or '').strip()
        LOG.info('%s is up ', controller_checked)
|
67
tobiko/tests/faults/ha/test_cloud_recovery.py
Normal file
67
tobiko/tests/faults/ha/test_cloud_recovery.py
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
from __future__ import absolute_import
|
||||||
|
|
||||||
|
import testtools
|
||||||
|
|
||||||
|
from tobiko.tests.faults.ha import cloud_disruptions
|
||||||
|
from tobiko.tripleo import pacemaker
|
||||||
|
from tobiko.tripleo import processes
|
||||||
|
|
||||||
|
|
||||||
|
def nodes_health_check():
    """Run the cloud health checks and fail when one reports unhealthy.

    Calls ``check_pacemaker_resources_health()`` and then
    ``check_overcloud_processes_health()``. Their results used to be
    discarded, so a check that reports failure by returning a false value
    (instead of raising) could never fail the calling test.

    :raises AssertionError: when either check returns a false value
    """
    # this method will be changed in future commit
    if not check_pacemaker_resources_health():
        raise AssertionError(
            "pacemaker resources are not in a healthy state")
    # NOTE(review): assumes the processes check returns a true value on
    # success (its docstring says ":return: Bool") -- confirm.
    if not check_overcloud_processes_health():
        raise AssertionError(
            "basic overcloud processes are not all running")
    # TODO:
    # Test existing created servers
    # ServerStackResourcesTest().test_server_create()
|
||||||
|
|
||||||
|
|
||||||
|
# check cluster failed statuses
|
||||||
|
def check_pacemaker_resources_health():
    """Report the health of the pacemaker resources.

    Builds a new ``pacemaker.PacemakerResourcesStatus`` object and reads
    its ``all_healthy`` attribute.

    :return: the value of ``all_healthy``
    """
    pcs_status = pacemaker.PacemakerResourcesStatus()
    return pcs_status.all_healthy
|
||||||
|
|
||||||
|
|
||||||
|
def check_overcloud_processes_health():
    """Report whether the basic overcloud processes are running.

    Builds a new ``processes.OvercloudProcessesStatus`` object and reads
    its ``basic_overcloud_processes_running`` attribute.

    :return: the value of ``basic_overcloud_processes_running``
    """
    procs_status = processes.OvercloudProcessesStatus()
    return procs_status.basic_overcloud_processes_running
|
||||||
|
|
||||||
|
# TODO:
|
||||||
|
# class ServerStackResourcesTest(testtools.TestCase):
|
||||||
|
#
|
||||||
|
# """Tests connectivity via floating IPs"""
|
||||||
|
#
|
||||||
|
# #: Resources stack with floating IP and Nova server
|
||||||
|
# # TODO move down :
|
||||||
|
# # stack = tobiko.required_setup_fixture(stacks.CirrosServerStackFixture)
|
||||||
|
# # stack = tobiko.setup(my_instace) #tobiko.setup(my_instace)
|
||||||
|
#
|
||||||
|
# # TODO new instances of the class , give a uniq stack name
|
||||||
|
# # TODO : create a new CirrosServerStackFixture ?
|
||||||
|
# # CirrosServerStackNameFixture(stack_name='my-unique-id')
|
||||||
|
# # tobiko.setup(my_instace) -> tobiko.cleanup(my_instance)
|
||||||
|
# def test_create_vm(self):
|
||||||
|
# """Test SSH connectivity to floating IP address"""
|
||||||
|
# stack = tobiko.setup(my_instace) # tobiko.setup(my_instace)
|
||||||
|
# tobiko.cleanup(my_instance)
|
||||||
|
# # TODO : add check if old vm is there
|
||||||
|
# hostname = sh.get_hostname(ssh_client=self.stack.ssh_client)
|
||||||
|
# self.assertEqual(self.stack.server_name.lower(), hostname)
|
||||||
|
|
||||||
|
|
||||||
|
class RebootNodesTest(testtools.TestCase):
    """HA tests: run health check -> disruptive action -> health check.

    disruptive_action: a function that runs some disruptive scenario
    on an overcloud.
    """

    def test_reboot_controllers_recovery(self):
        # The disruptive action of this scenario: reboot the controllers.
        disruptive_action = (
            cloud_disruptions.reset_all_controller_nodes_sequentially)

        # Health is checked both before and after the disruption.
        nodes_health_check()
        disruptive_action()
        nodes_health_check()
|
||||||
|
|
||||||
|
|
||||||
|
# [..]
|
||||||
|
# more tests to follow
|
||||||
|
# run health checks
|
||||||
|
# os faults stop rabbitmq service on one controller
|
||||||
|
# run health checks again
|
@ -1,26 +0,0 @@
|
|||||||
# Copyright (c) 2019 Red Hat
|
|
||||||
# All Rights Reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
|
||||||
# not use this file except in compliance with the License. You may obtain
|
|
||||||
# a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
|
||||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
|
||||||
# License for the specific language governing permissions and limitations
|
|
||||||
# under the License.
|
|
||||||
from __future__ import absolute_import
|
|
||||||
|
|
||||||
|
|
||||||
import testtools
|
|
||||||
from tobiko.openstack import os_faults
|
|
||||||
|
|
||||||
|
|
||||||
class CloudManagementTest(testtools.TestCase):
    """Check the cloud management object provided by ``os_faults``."""

    def test_connect(self):
        # Obtain the cloud management object and call its verify() method.
        os_faults.get_os_fault_cloud_managenemt().verify()
|
|
@ -7,6 +7,7 @@ import six
|
|||||||
import tobiko
|
import tobiko
|
||||||
from tobiko.tripleo import overcloud
|
from tobiko.tripleo import overcloud
|
||||||
from tobiko.shell import sh
|
from tobiko.shell import sh
|
||||||
|
from tobiko.openstack import topology
|
||||||
|
|
||||||
|
|
||||||
LOG = log.getLogger(__name__)
|
LOG = log.getLogger(__name__)
|
||||||
@ -16,7 +17,7 @@ class PcsResourceException(tobiko.TobikoException):
|
|||||||
message = "pcs cluster is not in a healthy state"
|
message = "pcs cluster is not in a healthy state"
|
||||||
|
|
||||||
|
|
||||||
def get_pcs_resources_table(hostname='controller-0'):
|
def get_pcs_resources_table():
|
||||||
"""
|
"""
|
||||||
get pcs status from a controller and parse it
|
get pcs status from a controller and parse it
|
||||||
to have it's resources states in check
|
to have it's resources states in check
|
||||||
@ -51,13 +52,28 @@ def get_pcs_resources_table(hostname='controller-0'):
|
|||||||
|
|
||||||
:return: dataframe of pcs resources stats table
|
:return: dataframe of pcs resources stats table
|
||||||
"""
|
"""
|
||||||
ssh_client = overcloud.overcloud_ssh_client(hostname)
|
# TODO make more robust(done, need other methods to be too)
|
||||||
output = sh.execute("sudo pcs status | grep ocf",
|
# TODO make table.columns retry without exception
|
||||||
ssh_client=ssh_client).stdout
|
|
||||||
stream = six.StringIO(output)
|
nodes = topology.list_openstack_nodes(group='controller')
|
||||||
table = pandas.read_csv(stream, delim_whitespace=True, header=None)
|
controller_node = nodes[0].name
|
||||||
table.columns = ['resource', 'resource_type', 'resource_state',
|
ssh_client = overcloud.overcloud_ssh_client(controller_node)
|
||||||
'overcloud_node']
|
|
||||||
|
# prevent pcs table read failure while pacemaker is starting
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
output = sh.execute("sudo pcs status | grep ocf",
|
||||||
|
ssh_client=ssh_client,
|
||||||
|
expect_exit_status=None).stdout
|
||||||
|
stream = six.StringIO(output)
|
||||||
|
table = pandas.read_csv(stream, delim_whitespace=True, header=None)
|
||||||
|
|
||||||
|
table.columns = ['resource', 'resource_type', 'resource_state',
|
||||||
|
'overcloud_node']
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
break
|
||||||
LOG.debug("Got pcs status :\n%s", table)
|
LOG.debug("Got pcs status :\n%s", table)
|
||||||
return table
|
return table
|
||||||
|
|
||||||
@ -152,23 +168,26 @@ class PacemakerResourcesStatus(object):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
def ovn_resource_healthy(self):
|
def ovn_resource_healthy(self):
|
||||||
nodes_num = self.resource_count("(ocf::heartbeat:redis):")
|
if self.container_runtime() == 'podman':
|
||||||
if nodes_num > 0:
|
nodes_num = self.resource_count("(ocf::heartbeat:redis):")
|
||||||
return True
|
if nodes_num > 0:
|
||||||
else:
|
|
||||||
master_num = self.resource_count_in_state(
|
|
||||||
"(ocf::heartbeat:redis):", "Master")
|
|
||||||
slave_num = self.resource_count_in_state(
|
|
||||||
"(ocf::heartbeat:redis):", "Slave")
|
|
||||||
if (master_num == 1) and (slave_num == nodes_num - master_num):
|
|
||||||
LOG.info(
|
|
||||||
"pcs status check: resource ovn is in healthy state")
|
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
LOG.info(
|
master_num = self.resource_count_in_state(
|
||||||
"pcs status check: resource ovn is in not in "
|
"(ocf::heartbeat:redis):", "Master")
|
||||||
"healthy state")
|
slave_num = self.resource_count_in_state(
|
||||||
return False
|
"(ocf::heartbeat:redis):", "Slave")
|
||||||
|
if (master_num == 1) and (slave_num == nodes_num - master_num):
|
||||||
|
LOG.info(
|
||||||
|
"pcs status check: resource ovn is in healthy state")
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
LOG.info(
|
||||||
|
"pcs status check: resource ovn is in not in "
|
||||||
|
"healthy state")
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def all_healthy(self):
|
def all_healthy(self):
|
||||||
@ -177,23 +196,35 @@ class PacemakerResourcesStatus(object):
|
|||||||
and return a global healthy status
|
and return a global healthy status
|
||||||
:return: Bool
|
:return: Bool
|
||||||
"""
|
"""
|
||||||
if all([
|
for _ in range(360):
|
||||||
self.rabbitmq_resource_healthy(),
|
|
||||||
self.galera_resource_healthy(),
|
try:
|
||||||
self.redis_resource_healthy(),
|
|
||||||
self.vips_resource_healthy(),
|
if all([
|
||||||
self.ha_proxy_cinder_healthy(),
|
self.rabbitmq_resource_healthy(),
|
||||||
self.ovn_resource_healthy()
|
self.galera_resource_healthy(),
|
||||||
]):
|
self.redis_resource_healthy(),
|
||||||
LOG.info("pcs status checks: all resources are in healthy state")
|
self.vips_resource_healthy(),
|
||||||
return True
|
self.ha_proxy_cinder_healthy(),
|
||||||
else:
|
self.ovn_resource_healthy()
|
||||||
LOG.info("pcs status check: not all resources are in healthy "
|
]):
|
||||||
"state")
|
LOG.info("pcs status checks: all resources are"
|
||||||
raise PcsResourceException()
|
" in healthy state")
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
LOG.info("pcs status check: not all resources are "
|
||||||
|
"in healthy "
|
||||||
|
"state")
|
||||||
|
raise PcsResourceException()
|
||||||
|
except PcsResourceException:
|
||||||
|
# reread pcs status
|
||||||
|
self.pcs_df = get_pcs_resources_table()
|
||||||
|
# exhausted all retries
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
def get_overcloud_nodes_running_pcs_resource(resource=None, resource_type=None,
|
def get_overcloud_nodes_running_pcs_resource(resource=None,
|
||||||
|
resource_type=None,
|
||||||
resource_state=None):
|
resource_state=None):
|
||||||
"""
|
"""
|
||||||
Check what nodes are running the specified resource/type/state
|
Check what nodes are running the specified resource/type/state
|
||||||
|
@ -111,6 +111,11 @@ class OvercloudProcessesStatus(object):
|
|||||||
:return: Bool
|
:return: Bool
|
||||||
"""
|
"""
|
||||||
for process_name in self.processes_to_check:
|
for process_name in self.processes_to_check:
|
||||||
|
# osp16/python3 process is "neutron-server:"
|
||||||
|
if process_name == 'neutron-server' and \
|
||||||
|
self.oc_procs_df.query('PROCESS=="{}"'.format(
|
||||||
|
process_name)).empty:
|
||||||
|
process_name = 'neutron-server:'
|
||||||
if not self.oc_procs_df.query('PROCESS=="{}"'.format(
|
if not self.oc_procs_df.query('PROCESS=="{}"'.format(
|
||||||
process_name)).empty:
|
process_name)).empty:
|
||||||
LOG.info("overcloud processes status checks: process {} is "
|
LOG.info("overcloud processes status checks: process {} is "
|
||||||
|
Loading…
Reference in New Issue
Block a user