From 42b98cc517cb6b8cfbff914a44a58b72e6265659 Mon Sep 17 00:00:00 2001 From: pinikomarov Date: Wed, 22 Jan 2020 02:03:22 +0200 Subject: [PATCH] basic HA test : disruptive action and health checks Tested successfully via Infrared on both OSP13/16 (rhel7/8) Change-Id: I27e23593f09d7ae8f0811b85d35c0df15dd56593 --- tobiko/tests/faults/ha/__init__.py | 0 tobiko/tests/faults/ha/cloud_disruptions.py | 27 +++++ tobiko/tests/faults/ha/test_cloud_recovery.py | 67 +++++++++++ tobiko/tests/faults/test_cloud.py | 26 ----- tobiko/tripleo/pacemaker.py | 107 +++++++++++------- tobiko/tripleo/processes.py | 5 + 6 files changed, 168 insertions(+), 64 deletions(-) create mode 100644 tobiko/tests/faults/ha/__init__.py create mode 100644 tobiko/tests/faults/ha/cloud_disruptions.py create mode 100644 tobiko/tests/faults/ha/test_cloud_recovery.py delete mode 100644 tobiko/tests/faults/test_cloud.py diff --git a/tobiko/tests/faults/ha/__init__.py b/tobiko/tests/faults/ha/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tobiko/tests/faults/ha/cloud_disruptions.py b/tobiko/tests/faults/ha/cloud_disruptions.py new file mode 100644 index 000000000..73b6e17d4 --- /dev/null +++ b/tobiko/tests/faults/ha/cloud_disruptions.py @@ -0,0 +1,27 @@ + +from __future__ import absolute_import + +import tobiko +from tobiko.shell import sh +from tobiko.openstack import topology +from oslo_log import log + + +LOG = log.getLogger(__name__) + + +def reset_all_controller_nodes_sequentially(): + + # reboot all controllers and wait for ssh Up on them + nodes = topology.list_openstack_nodes(group='controller') + for controller in nodes: + sh.execute("sudo reboot", ssh_client=controller.ssh_client, + expect_exit_status=None) + LOG.info('rebooted {}'.format(controller.name)) + tobiko.cleanup_fixture(controller.ssh_client) + + for controller in topology.list_openstack_nodes(group='controller'): + controller_checked = sh.execute("hostname", + ssh_client=controller.ssh_client, + 
expect_exit_status=None).stdout + LOG.info('{} is up '.format(controller_checked)) diff --git a/tobiko/tests/faults/ha/test_cloud_recovery.py b/tobiko/tests/faults/ha/test_cloud_recovery.py new file mode 100644 index 000000000..27b4dd2f1 --- /dev/null +++ b/tobiko/tests/faults/ha/test_cloud_recovery.py @@ -0,0 +1,67 @@ +from __future__ import absolute_import + +import testtools + +from tobiko.tests.faults.ha import cloud_disruptions +from tobiko.tripleo import pacemaker +from tobiko.tripleo import processes + + +def nodes_health_check(): + # this method will be changed in future commit + check_pacemaker_resources_health() + check_overcloud_processes_health() + # TODO: + # Test existing created servers + # ServerStackResourcesTest().test_server_create() + + +# check cluster failed statuses +def check_pacemaker_resources_health(): + return pacemaker.PacemakerResourcesStatus().all_healthy + + +def check_overcloud_processes_health(): + return processes.OvercloudProcessesStatus( + ).basic_overcloud_processes_running + +# TODO: +# class ServerStackResourcesTest(testtools.TestCase): +# +# """Tests connectivity via floating IPs""" +# +# #: Resources stack with floating IP and Nova server +# # TODO move down : +# # stack = tobiko.required_setup_fixture(stacks.CirrosServerStackFixture) +# # stack = tobiko.setup(my_instace) #tobiko.setup(my_instace) +# +# # TODO new instances of the class , give a uniq stack name +# # TODO : create a new CirrosServerStackFixture ? 
+# # CirrosServerStackNameFixture(stack_name='my-unique-id') +# # tobiko.setup(my_instace) -> tobiko.cleanup(my_instance) +# def test_create_vm(self): +# """Test SSH connectivity to floating IP address""" +# stack = tobiko.setup(my_instace) # tobiko.setup(my_instace) +# tobiko.cleanup(my_instance) +# # TODO : add check if old vm is there +# hostname = sh.get_hostname(ssh_client=self.stack.ssh_client) +# self.assertEqual(self.stack.server_name.lower(), hostname) + + +class RebootNodesTest(testtools.TestCase): + + """ HA Tests: run health check -> disruptive action -> health check + disruptive_action: a function that runs some + disruptive scenario on an overcloud""" + + def test_reboot_controllers_recovery(self): + nodes_health_check() + cloud_disruptions.reset_all_controller_nodes_sequentially() + nodes_health_check() + + +# [..] +# more tests to follow +# run health checks +# os faults stop rabbitmq service on one controller +# run health checks again diff --git a/tobiko/tests/faults/test_cloud.py b/tobiko/tests/faults/test_cloud.py deleted file mode 100644 index 71e8e21f5..000000000 --- a/tobiko/tests/faults/test_cloud.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2019 Red Hat -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
-from __future__ import absolute_import - - -import testtools -from tobiko.openstack import os_faults - - -class CloudManagementTest(testtools.TestCase): - - def test_connect(self): - cloud_management = os_faults.get_os_fault_cloud_managenemt() - cloud_management.verify() diff --git a/tobiko/tripleo/pacemaker.py b/tobiko/tripleo/pacemaker.py index c1f04d224..33070bca8 100644 --- a/tobiko/tripleo/pacemaker.py +++ b/tobiko/tripleo/pacemaker.py @@ -7,6 +7,7 @@ import six import tobiko from tobiko.tripleo import overcloud from tobiko.shell import sh +from tobiko.openstack import topology LOG = log.getLogger(__name__) @@ -16,7 +17,7 @@ class PcsResourceException(tobiko.TobikoException): message = "pcs cluster is not in a healthy state" -def get_pcs_resources_table(hostname='controller-0'): +def get_pcs_resources_table(): """ get pcs status from a controller and parse it to have it's resources states in check @@ -51,13 +52,28 @@ def get_pcs_resources_table(hostname='controller-0'): :return: dataframe of pcs resources stats table """ - ssh_client = overcloud.overcloud_ssh_client(hostname) - output = sh.execute("sudo pcs status | grep ocf", - ssh_client=ssh_client).stdout - stream = six.StringIO(output) - table = pandas.read_csv(stream, delim_whitespace=True, header=None) - table.columns = ['resource', 'resource_type', 'resource_state', - 'overcloud_node'] + # TODO make more robust(done, need other methods to be too) + # TODO make table.columns retry without exception + + nodes = topology.list_openstack_nodes(group='controller') + controller_node = nodes[0].name + ssh_client = overcloud.overcloud_ssh_client(controller_node) + + # prevent pcs table read failure while pacemaker is starting + while True: + try: + output = sh.execute("sudo pcs status | grep ocf", + ssh_client=ssh_client, + expect_exit_status=None).stdout + stream = six.StringIO(output) + table = pandas.read_csv(stream, delim_whitespace=True, header=None) + + table.columns = ['resource', 'resource_type', 
'resource_state', + 'overcloud_node'] + except ValueError: + pass + else: + break LOG.debug("Got pcs status :\n%s", table) return table @@ -152,23 +168,26 @@ class PacemakerResourcesStatus(object): return False def ovn_resource_healthy(self): - nodes_num = self.resource_count("(ocf::heartbeat:redis):") - if nodes_num > 0: - return True - else: - master_num = self.resource_count_in_state( - "(ocf::heartbeat:redis):", "Master") - slave_num = self.resource_count_in_state( - "(ocf::heartbeat:redis):", "Slave") - if (master_num == 1) and (slave_num == nodes_num - master_num): - LOG.info( - "pcs status check: resource ovn is in healthy state") + if self.container_runtime() == 'podman': + nodes_num = self.resource_count("(ocf::heartbeat:redis):") + if nodes_num > 0: return True else: - LOG.info( - "pcs status check: resource ovn is in not in " - "healthy state") - return False + master_num = self.resource_count_in_state( + "(ocf::heartbeat:redis):", "Master") + slave_num = self.resource_count_in_state( + "(ocf::heartbeat:redis):", "Slave") + if (master_num == 1) and (slave_num == nodes_num - master_num): + LOG.info( + "pcs status check: resource ovn is in healthy state") + return True + else: + LOG.info( + "pcs status check: resource ovn is in not in " + "healthy state") + return False + else: + return True @property def all_healthy(self): @@ -177,23 +196,35 @@ class PacemakerResourcesStatus(object): and return a global healthy status :return: Bool """ - if all([ - self.rabbitmq_resource_healthy(), - self.galera_resource_healthy(), - self.redis_resource_healthy(), - self.vips_resource_healthy(), - self.ha_proxy_cinder_healthy(), - self.ovn_resource_healthy() - ]): - LOG.info("pcs status checks: all resources are in healthy state") - return True - else: - LOG.info("pcs status check: not all resources are in healthy " - "state") - raise PcsResourceException() + for _ in range(360): + + try: + + if all([ + self.rabbitmq_resource_healthy(), + self.galera_resource_healthy(), + 
self.redis_resource_healthy(), + self.vips_resource_healthy(), + self.ha_proxy_cinder_healthy(), + self.ovn_resource_healthy() + ]): + LOG.info("pcs status checks: all resources are" + " in healthy state") + return True + else: + LOG.info("pcs status check: not all resources are " + "in healthy " + "state") + raise PcsResourceException() + except PcsResourceException: + # reread pcs status + self.pcs_df = get_pcs_resources_table() + # exhausted all retries + return False -def get_overcloud_nodes_running_pcs_resource(resource=None, resource_type=None, +def get_overcloud_nodes_running_pcs_resource(resource=None, + resource_type=None, resource_state=None): """ Check what nodes are running the specified resource/type/state diff --git a/tobiko/tripleo/processes.py b/tobiko/tripleo/processes.py index 98cf42a99..a39ff0260 100644 --- a/tobiko/tripleo/processes.py +++ b/tobiko/tripleo/processes.py @@ -111,6 +111,11 @@ class OvercloudProcessesStatus(object): :return: Bool """ for process_name in self.processes_to_check: + # osp16/python3 process is "neutron-server:" + if process_name == 'neutron-server' and \ + self.oc_procs_df.query('PROCESS=="{}"'.format( + process_name)).empty: + process_name = 'neutron-server:' if not self.oc_procs_df.query('PROCESS=="{}"'.format( process_name)).empty: LOG.info("overcloud processes status checks: process {} is "