Basic HA test: disruptive action and health checks

Tested successfully via Infrared on both OSP13 and OSP16 (RHEL 7/8).

Change-Id: I27e23593f09d7ae8f0811b85d35c0df15dd56593
This commit is contained in:
parent 20e53b4762
commit 42b98cc517
0    tobiko/tests/faults/ha/__init__.py    Normal file
27    tobiko/tests/faults/ha/cloud_disruptions.py    Normal file
@@ -0,0 +1,27 @@
+from __future__ import absolute_import
+
+import tobiko
+from tobiko.shell import sh
+from tobiko.openstack import topology
+from oslo_log import log
+
+
+LOG = log.getLogger(__name__)
+
+
+def reset_all_controller_nodes_sequentially():
+
+    # reboot all controllers and wait for ssh Up on them
+    nodes = topology.list_openstack_nodes(group='controller')
+    for controller in nodes:
+        sh.execute("sudo reboot", ssh_client=controller.ssh_client,
+                   expect_exit_status=None)
+        LOG.info('rebooted {}'.format(controller.name))
+        tobiko.cleanup_fixture(controller.ssh_client)
+
+    for controller in topology.list_openstack_nodes(group='controller'):
+        controller_checked = sh.execute("hostname",
+                                        ssh_client=controller.ssh_client,
+                                        expect_exit_status=None).stdout
+        LOG.info('{} is up '.format(controller_checked))
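The helper's inline comment says it will "wait for ssh Up" on the rebooted controllers, but the check above issues a single hostname call per node with expect_exit_status=None, so a node that is still booting is only logged, not waited for. A minimal sketch of an explicit wait loop, reusing only the topology and sh calls already shown; the helper name, retry count and interval are illustrative assumptions, not part of this change:

import time

from tobiko.openstack import topology
from tobiko.shell import sh


def wait_for_controllers_ssh(retries=60, interval=10):
    # Poll each controller until a trivial command succeeds over SSH again.
    # sh.execute raises when the command (or, presumably, the SSH connection)
    # fails, since expect_exit_status is left at its default here.
    for controller in topology.list_openstack_nodes(group='controller'):
        for _ in range(retries):
            try:
                sh.execute("hostname", ssh_client=controller.ssh_client)
            except Exception:
                time.sleep(interval)
            else:
                break
        else:
            raise RuntimeError(
                "controller {} did not come back after reboot".format(
                    controller.name))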
67    tobiko/tests/faults/ha/test_cloud_recovery.py    Normal file
@@ -0,0 +1,67 @@
+from __future__ import absolute_import
+
+import testtools
+
+from tobiko.tests.faults.ha import cloud_disruptions
+from tobiko.tripleo import pacemaker
+from tobiko.tripleo import processes
+
+
+def nodes_health_check():
+    # this method will be changed in a future commit
+    check_pacemaker_resources_health()
+    check_overcloud_processes_health()
+    # TODO:
+    # Test existing created servers
+    # ServerStackResourcesTest().test_server_create()
+
+
+# check cluster failed statuses
+def check_pacemaker_resources_health():
+    return pacemaker.PacemakerResourcesStatus().all_healthy
+
+
+def check_overcloud_processes_health():
+    return processes.OvercloudProcessesStatus(
+    ).basic_overcloud_processes_running
+
+
+# TODO:
+# class ServerStackResourcesTest(testtools.TestCase):
+#
+#     """Tests connectivity via floating IPs"""
+#
+#     #: Resources stack with floating IP and Nova server
+#     # TODO move down:
+#     # stack = tobiko.required_setup_fixture(stacks.CirrosServerStackFixture)
+#     # stack = tobiko.setup(my_instance)
+#
+#     # TODO: new instances of the class get a unique stack name
+#     # TODO: create a new CirrosServerStackFixture?
+#     # CirrosServerStackNameFixture(stack_name='my-unique-id')
+#     # tobiko.setup(my_instance) -> tobiko.cleanup(my_instance)
+#     def test_create_vm(self):
+#         """Test SSH connectivity to floating IP address"""
+#         stack = tobiko.setup(my_instance)
+#         tobiko.cleanup(my_instance)
+#         # TODO: add check if old vm is there
+#         hostname = sh.get_hostname(ssh_client=self.stack.ssh_client)
+#         self.assertEqual(self.stack.server_name.lower(), hostname)
+
+
+class RebootNodesTest(testtools.TestCase):
+
+    """ HA Tests: run health check -> disruptive action -> health check
+    disruptive_action: a function that runs some
+    disruptive scenario on an overcloud"""
+
+    def test_reboot_controllers_recovery(self):
+        nodes_health_check()
+        cloud_disruptions.reset_all_controller_nodes_sequentially()
+        nodes_health_check()
+
+
+# [..]
+# more tests to follow:
+# run health checks
+# os-faults stop rabbitmq service on one controller
+# run health checks again
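The trailing comments outline the next planned scenario: stop the RabbitMQ service on one controller (via os-faults) between two health-check passes. Purely as an illustration of how that would slot into the same check -> disrupt -> check pattern, a hypothetical follow-up test could look like the sketch below; the test class and the disruption helper it calls are assumptions, not part of this commit:

# Hedged sketch only: if added to test_cloud_recovery.py, testtools,
# cloud_disruptions and nodes_health_check are already available above.
# stop_rabbitmq_service_on_one_controller() is a hypothetical helper that
# does not exist in this change.
class StopRabbitmqServiceTest(testtools.TestCase):

    def test_stop_rabbitmq_on_one_controller_recovery(self):
        nodes_health_check()
        cloud_disruptions.stop_rabbitmq_service_on_one_controller()
        nodes_health_check()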
@@ -1,26 +0,0 @@
-# Copyright (c) 2019 Red Hat
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations
-# under the License.
-from __future__ import absolute_import
-
-
-import testtools
-from tobiko.openstack import os_faults
-
-
-class CloudManagementTest(testtools.TestCase):
-
-    def test_connect(self):
-        cloud_management = os_faults.get_os_fault_cloud_managenemt()
-        cloud_management.verify()
@@ -7,6 +7,7 @@ import six
 import tobiko
 from tobiko.tripleo import overcloud
 from tobiko.shell import sh
+from tobiko.openstack import topology


 LOG = log.getLogger(__name__)
@@ -16,7 +17,7 @@ class PcsResourceException(tobiko.TobikoException):
     message = "pcs cluster is not in a healthy state"


-def get_pcs_resources_table(hostname='controller-0'):
+def get_pcs_resources_table():
     """
     get pcs status from a controller and parse it
     to have it's resources states in check
@@ -51,13 +52,28 @@ def get_pcs_resources_table(hostname='controller-0'):

     :return: dataframe of pcs resources stats table
     """
-    ssh_client = overcloud.overcloud_ssh_client(hostname)
-    output = sh.execute("sudo pcs status | grep ocf",
-                        ssh_client=ssh_client).stdout
-    stream = six.StringIO(output)
-    table = pandas.read_csv(stream, delim_whitespace=True, header=None)
-
-    table.columns = ['resource', 'resource_type', 'resource_state',
-                     'overcloud_node']
+    # TODO make more robust(done, need other methods to be too)
+    # TODO make table.columns retry without exception
+
+    nodes = topology.list_openstack_nodes(group='controller')
+    controller_node = nodes[0].name
+    ssh_client = overcloud.overcloud_ssh_client(controller_node)
+
+    # prevent pcs table read failure while pacemaker is starting
+    while True:
+        try:
+            output = sh.execute("sudo pcs status | grep ocf",
+                                ssh_client=ssh_client,
+                                expect_exit_status=None).stdout
+            stream = six.StringIO(output)
+            table = pandas.read_csv(stream, delim_whitespace=True, header=None)
+
+            table.columns = ['resource', 'resource_type', 'resource_state',
+                             'overcloud_node']
+        except ValueError:
+            pass
+        else:
+            break
     LOG.debug("Got pcs status :\n%s", table)
     return table
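The parsed table makes the health checks below straightforward pandas queries. A small illustrative example of the kind of frame get_pcs_resources_table produces and how a per-type/per-state count can be read from it; the sample rows are invented, only the column names come from the diff:

import pandas

# Toy frame with the columns introduced above; the rows are made up purely
# to show the query pattern, not real "pcs status" output.
table = pandas.DataFrame(
    [
        ['rabbitmq-bundle-0', '(ocf::heartbeat:rabbitmq-cluster):',
         'Started', 'controller-0'],
        ['galera-bundle-0', '(ocf::heartbeat:galera):',
         'Master', 'controller-0'],
    ],
    columns=['resource', 'resource_type', 'resource_state', 'overcloud_node'])

# Count resources of a given type in a given state, similar in spirit to the
# resource_count-style checks used by PacemakerResourcesStatus.
started = table.query(
    "resource_type == '(ocf::heartbeat:rabbitmq-cluster):' and "
    "resource_state == 'Started'")
print(len(started))  # -> 1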
@@ -152,6 +168,7 @@ class PacemakerResourcesStatus(object):
             return False

     def ovn_resource_healthy(self):
-        nodes_num = self.resource_count("(ocf::heartbeat:redis):")
-        if nodes_num > 0:
-            return True
+        if self.container_runtime() == 'podman':
+            nodes_num = self.resource_count("(ocf::heartbeat:redis):")
+            if nodes_num > 0:
+                return True
@@ -169,6 +186,8 @@ class PacemakerResourcesStatus(object):
-                "pcs status check: resource ovn is in not in "
-                "healthy state")
-            return False
+                    "pcs status check: resource ovn is in not in "
+                    "healthy state")
+                return False
+        else:
+            return True

     @property
     def all_healthy(self):
@@ -177,6 +196,10 @@ class PacemakerResourcesStatus(object):
         and return a global healthy status
         :return: Bool
         """
+        for _ in range(360):
+
+            try:
+
                 if all([
                     self.rabbitmq_resource_healthy(),
                     self.galera_resource_healthy(),
@@ -185,15 +208,23 @@ class PacemakerResourcesStatus(object):
                     self.ha_proxy_cinder_healthy(),
                     self.ovn_resource_healthy()
                 ]):
-                    LOG.info("pcs status checks: all resources are in healthy state")
+                    LOG.info("pcs status checks: all resources are"
+                             " in healthy state")
                     return True
                 else:
-                    LOG.info("pcs status check: not all resources are in healthy "
+                    LOG.info("pcs status check: not all resources are "
+                             "in healthy "
                              "state")
                     raise PcsResourceException()
+            except PcsResourceException:
+                # reread pcs status
+                self.pcs_df = get_pcs_resources_table()
+        # exhausted all retries
+        return False


-def get_overcloud_nodes_running_pcs_resource(resource=None, resource_type=None,
+def get_overcloud_nodes_running_pcs_resource(resource=None,
+                                             resource_type=None,
                                              resource_state=None):
     """
     Check what nodes are running the specified resource/type/state
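With this hunk, all_healthy retries up to 360 times, rereading the pcs table whenever PcsResourceException is raised; no explicit sleep appears in the added lines, so pacing comes only from the time the SSH reads and checks themselves take. For comparison, a generic retry helper with an explicit pause might look like the sketch below; the helper name, the 10-second interval and the check callable are illustrative assumptions, not part of this change:

import time


def retry_until_true(check, attempts=360, interval=10):
    """Call check() until it returns True, sleeping between attempts,
    or give up after the given number of tries."""
    for _ in range(attempts):
        if check():
            return True
        time.sleep(interval)
    # exhausted all retries
    return False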
@@ -111,6 +111,11 @@ class OvercloudProcessesStatus(object):
         :return: Bool
         """
         for process_name in self.processes_to_check:
+            # osp16/python3 process is "neutron-server:"
+            if process_name == 'neutron-server' and \
+                    self.oc_procs_df.query('PROCESS=="{}"'.format(
+                        process_name)).empty:
+                process_name = 'neutron-server:'
             if not self.oc_procs_df.query('PROCESS=="{}"'.format(
                     process_name)).empty:
                 LOG.info("overcloud processes status checks: process {} is "
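On OSP16/Python 3 the Neutron API process is reported as "neutron-server:", so the check falls back to that name when the plain "neutron-server" query comes back empty. A small self-contained example of the same PROCESS-column query pattern; the sample rows are invented, only the column name and query shape come from the diff:

import pandas

# Invented sample of the overcloud process table; only the PROCESS column
# name and the fallback logic mirror the change above.
oc_procs_df = pandas.DataFrame({'PROCESS': ['neutron-server:', 'nova-compute']})

process_name = 'neutron-server'
if oc_procs_df.query('PROCESS=="{}"'.format(process_name)).empty:
    # osp16/python3 reports the process as "neutron-server:"
    process_name = 'neutron-server:'

print(not oc_procs_df.query('PROCESS=="{}"'.format(process_name)).empty)  # True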