tobiko/tobiko/tests/faults/ha/cloud_disruptions.py

# Copyright (c) 2021 Red Hat, Inc.
#
# All Rights Reserved.
#
#    Licensed under the Apache License, Version 2.0 (the "License"); you may
#    not use this file except in compliance with the License. You may obtain
#    a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#    License for the specific language governing permissions and limitations
#    under the License.
from __future__ import absolute_import

from datetime import datetime
import math
import random
import re
import socket
import time
import urllib.parse

import netaddr
from oslo_log import log

import tobiko
from tobiko import config
from tobiko.openstack import glance
from tobiko.openstack import keystone
from tobiko.openstack import neutron
from tobiko.openstack import stacks
from tobiko.openstack import tests
from tobiko.openstack import topology
from tobiko.tests.faults.ha import test_cloud_recovery
from tobiko.shell import ping
from tobiko.shell import sh
from tobiko.tripleo import containers
from tobiko.tripleo import nova
from tobiko.tripleo import pacemaker
from tobiko.tripleo import topology as tripleo_topology
from tobiko import tripleo


CONF = config.CONF
LOG = log.getLogger(__name__)

network_disruption = """
 sudo iptables-save > ~/working.iptables.rules &&
 sudo iptables -I INPUT 1 -m state --state RELATED,ESTABLISHED -j ACCEPT &&
 sudo iptables -I INPUT 2 -p tcp -m state --state NEW -m tcp --dport 22 -j \
 ACCEPT &&
 sudo iptables -I INPUT 3 ! -i lo -j DROP &&
 sudo iptables -I OUTPUT 1 -p tcp --sport 22 -j ACCEPT &&
 sudo iptables -I OUTPUT 2 ! -o lo -j DROP
"""

network_disruption_ipv6 = network_disruption.replace('iptables', 'ip6tables')

undisrupt_network = """
 sudo iptables-restore ~/working.iptables.rules
"""

undisrupt_network_ipv6 = undisrupt_network.replace('iptables', 'ip6tables')

# TODO(eolivare): run ovn_db_pcs_resource_restart using
# run_pcs_resource_operation
# Now it is not possible because it is executed with
# ssh_client.connect().exec_command and run_pcs_resource_operation only
# supports sh.execute
ovn_db_pcs_resource_restart = (f"sudo pcs resource restart "
                               f"{pacemaker.OVN_DBS_RESOURCE}")
kill_rabbit = "sudo pkill -9 beam.smp"
remove_grastate = "sudo rm -rf /var/lib/mysql/grastate.dat"
check_bootstrap = """ps -eo lstart,cmd | grep -v grep|
grep wsrep-cluster-address=gcomm://"""
galera_sst_request = """sudo grep 'wsrep_sst_rsync.*'
/var/log/containers/mysql/mysqld.log"""
kill_mysqld = "sudo pkill -9 mysqld"
kill_mariadbd = "sudo pkill -9 mariadbd"


class PcsDisableException(tobiko.TobikoException):
    message = "pcs disable didn't shut down the resource"


class PcsEnableException(tobiko.TobikoException):
    message = "pcs enable didn't start the resource"


class GaleraBoostrapException(tobiko.TobikoException):
    message = "Bootstrap has not been activated"


class TimestampException(tobiko.TobikoException):
    message = "Timestamp mismatch: sst was requested before grastate removal"


def network_disrupt_node(node_name, disrupt_method=network_disruption):
    disrupt_node(node_name, disrupt_method=disrupt_method)


def network_undisrupt_node(node_name, disrupt_method=undisrupt_network):
    disrupt_node(node_name, disrupt_method=disrupt_method)


def disrupt_node(node_name, disrupt_method=network_disruption):
    # reboot all controllers and wait for ssh Up on them
    # hard reset is simultaneous while soft is sequential
    # method : method of disruption to use : network_disruption |
    # container_restart

    start_time = tobiko.time()
    # using ssh_client.connect we use a fire and forget reboot method
    node = tripleo_topology.get_node(node_name)
    node.ssh_client.connect().exec_command(disrupt_method)
    LOG.info('disrupt exec: {} on server: {}'.format(disrupt_method,
                                                     node.name))

    if isinstance(disrupt_method, sh.RebootHostMethod) \
            or is_network_disruption(disrupt_method):
        check_overcloud_node_uptime(node.ssh_client, start_time)
    else:
        check_overcloud_node_responsive(node)


def check_overcloud_node_uptime(ssh_client, start_time):
    for attempt in tobiko.retry(timeout=600., interval=10.):
        try:
            uptime = sh.get_uptime(ssh_client=ssh_client, timeout=15.)
        except sh.UptimeError:
            LOG.exception('uptime command failed')
            uptime = None

        if uptime and uptime < (tobiko.time() - start_time):
            LOG.debug('Reboot has been completed')
            break
        else:
            attempt.check_limits()


def reboot_node(node_name, wait=True, reboot_method=sh.hard_reset_method):

    # reboot a node and wait for ssh Up on them
    # hard reset is simultaneous while soft is sequential
    # method : method of disruption to use : reset | network_disruption

    # using ssh_client.connect we use a fire and forget reboot method
    node = tripleo_topology.get_node(node_name)
    sh.reboot_host(ssh_client=node.ssh_client, wait=wait, method=reboot_method)
    LOG.info('disrupt exec: {} on server: {}'.format(reboot_method,
                                                     node.name))


def check_overcloud_node_responsive(node):
    node_checked = sh.execute("hostname",
                              ssh_client=node.ssh_client,
                              expect_exit_status=None).stdout
    LOG.info('{} is up '.format(node_checked))

    tobiko.cleanup_fixture(node.ssh_client)


def network_disrupt_all_controller_nodes(disrupt_method=network_disruption,
                                         exclude_list=None):
    disrupt_all_controller_nodes(disrupt_method=disrupt_method,
                                 exclude_list=exclude_list)


def reset_all_controller_nodes(disrupt_method=sh.hard_reset_method,
                               exclude_list=None):
    disrupt_all_controller_nodes(disrupt_method=disrupt_method,
                                 exclude_list=exclude_list)


def reset_all_controller_nodes_sequentially(
        disrupt_method=sh.hard_reset_method,
        sequentially=True, exclude_list=None):
    disrupt_all_controller_nodes(disrupt_method=disrupt_method,
                                 sequentially=sequentially,
                                 exclude_list=exclude_list)


def disrupt_all_controller_nodes(disrupt_method=sh.hard_reset_method,
                                 sequentially=False, exclude_list=None):
    # TODO(eolivare): join disrupt_all_controller_nodes and
    # reboot_all_controller_nodes methods because they are very similar

    # reboot all controllers and wait for ssh Up on them
    # method : method of disruptino to use : reset | network_disruption
    # hard reset is simultaneous while soft is sequential
    # exclude_list = list of nodes to NOT reset

    controlplane_groups = ['controller', 'messaging', 'database', 'networker']
    actual_controlplane_groups = tripleo_topology.actual_node_groups(
        controlplane_groups)
    nodes = topology.list_openstack_nodes(group=actual_controlplane_groups)

    # remove excluded nodes from reset list
    if exclude_list:
        nodes = [node for node in nodes if node.name not in exclude_list]

    start_time = {}
    for controller in nodes:
        start_time[controller.name] = tobiko.time()
        if isinstance(disrupt_method, sh.RebootHostMethod):
            reboot_node(controller.name, wait=sequentially,
                        reboot_method=disrupt_method)
        else:
            # using ssh_client.connect we use a fire and forget reboot method
            controller.ssh_client.connect().exec_command(disrupt_method)
            LOG.info('disrupt exec: {} on server: {}'.format(disrupt_method,
                                                             controller.name))
            tobiko.cleanup_fixture(controller.ssh_client)
            if sequentially and is_network_disruption(disrupt_method):
                check_overcloud_node_uptime(
                    controller.ssh_client, start_time[controller.name])
        if sequentially:
            check_overcloud_node_responsive(controller)

    if not sequentially:
        for controller in nodes:
            if isinstance(disrupt_method, sh.RebootHostMethod) \
                    or is_network_disruption(disrupt_method):
                check_overcloud_node_uptime(
                    controller.ssh_client, start_time[controller.name])
            else:
                check_overcloud_node_responsive(controller)


def reboot_all_controller_nodes(reboot_method=sh.hard_reset_method,
                                sequentially=False, exclude_list=None):
    # reboot all controllers and wait for ssh Up on them
    # method : method of disruptino to use : hard or soft reset
    # hard reset is simultaneous while soft is sequential
    # exclude_list = list of nodes to NOT reset

    controlplane_groups = ['controller', 'messaging', 'database', 'networker']
    actual_controlplane_groups = tripleo_topology.actual_node_groups(
        controlplane_groups)
    nodes = topology.list_openstack_nodes(group=actual_controlplane_groups)

    # remove excluded nodes from reset list
    if exclude_list:
        nodes = [node for node in nodes if node.name not in exclude_list]

    start_time = {}
    for controller in nodes:
        start_time[controller.name] = tobiko.time()
        sh.reboot_host(ssh_client=controller.ssh_client, wait=sequentially,
                       method=reboot_method)
        LOG.info('reboot exec: {} on server: {}'.format(reboot_method,
                                                        controller.name))
        tobiko.cleanup_fixture(controller.ssh_client)
    if not sequentially:
        for controller in nodes:
            check_overcloud_node_uptime(
                controller.ssh_client, start_time[controller.name])


def is_ipv6addr_main_vip():
    """ Return True if main OC vip is ivp6 address. Otherwise return False."""
    main_vip = get_main_vip()
    return not netaddr.valid_ipv4(main_vip) and netaddr.valid_ipv6(main_vip)


def get_main_vip():
    """return the ip of the overcloud main vip.
    Retreive an ip address (ipv4/ipv6) from the auth_url."""
    auth_url = keystone.default_keystone_credentials().auth_url
    auth_url_parsed = urllib.parse.urlsplit(auth_url)
    main_vip = auth_url_parsed.hostname

    if not (netaddr.valid_ipv4(main_vip) or netaddr.valid_ipv6(main_vip)):
        try:
            # socket.gethostbyname translates hostname to IPv4 - it fails when
            # no IPv4 address is available
            main_vip = socket.gethostbyname(main_vip)
        except socket.gaierror:
            # the following method obtains an IPv6 from a hostname
            main_vip = socket.getaddrinfo(
                main_vip, None, socket.AF_INET6)[0][4][0]

    return main_vip


def get_main_vip_controller(main_vip):
    """return the controller hostname ,
    which is holding the main_vip pacemaker resource"""
    # when the main_vip is ipv6, the pacemaker command output replaces : by .
    # we need to adapt the value accordingly
    main_vim_controller = pacemaker.get_overcloud_nodes_running_pcs_resource(
        resource=f"ip-{main_vip.replace(':', '.')}")[0]
    return main_vim_controller


def delete_evacuable_tagged_image():
    # delete evacuable tagged image because it prevents
    # non tagged evacuations if exists
    for img in glance.list_images():
        if 'evacuable' in img['tags']:
            glance.delete_image(img.id)


def disrupt_controller_main_vip(disrupt_method=sh.hard_reset_method,
                                inverse=False):

    # reset the controller holding the main vip (os_auth_url)
    # ip resource (managed via pacemaker)
    # find main vip by getting it from
    main_vip = get_main_vip()

    # find the node holding that resource via :

    main_vip_controller = get_main_vip_controller(main_vip)

    if isinstance(disrupt_method, sh.RebootHostMethod):
        if inverse:
            reboot_all_controller_nodes(reboot_method=disrupt_method,
                                        exclude_list=[main_vip_controller])
        else:
            reboot_node(main_vip_controller, reboot_method=disrupt_method)
    else:
        if inverse:
            # inverse the nodes reset selection
            disrupt_all_controller_nodes(disrupt_method=disrupt_method,
                                         exclude_list=[main_vip_controller])
        else:
            # get that node's ssh_client and reset it
            disrupt_node(main_vip_controller, disrupt_method=disrupt_method)


def disrupt_controller_galera_main_vip(disrupt_method=sh.soft_reset_method):
    # This case reboots controller while VM creation is in progress
    # Please refer to RHBZ#2124877 for more info
    # Find the Galera VIP (port name : internal_api_virtual_ip)
    session = tripleo.undercloud_keystone_session()
    uc_neutron_client = neutron.get_neutron_client(session=session)
    try:
        new_port = neutron.find_port(client=uc_neutron_client, unique=False,
                                     name='internal_api_virtual_ip')
    except tobiko.ObjectNotFound as no_internal_api:
        raise tobiko.SkipException(
            'This OSP environment doesnt have an internal_api \
              network, so this test cannot be executed') from no_internal_api

    galera_vip_address = new_port['fixed_ips'][0]['ip_address']
    LOG.info("The Galera VIP address is: %r", galera_vip_address)
    # Find the controller hosting VIP resource
    galera_vip_resource = "ip-"+galera_vip_address
    galera_vip_controller = pacemaker.get_overcloud_nodes_running_pcs_resource(
                               resource=galera_vip_resource)[0]

    ports_before_stack_creation = neutron.list_ports(
        device_owner="compute:nova")
    multi_ip_test_fixture = tobiko.get_fixture(
        stacks.MultiIPCirrosServerStackFixture)
    tobiko.use_fixture(multi_ip_test_fixture)
    time.sleep(10)  # wait until some of the VMs have been created

    # Reboot that controller
    reboot_node(galera_vip_controller, wait=True,
                reboot_method=disrupt_method)

    return multi_ip_test_fixture, ports_before_stack_creation


def get_vms_detailed_info(multi_ip_test_fixture):
    for attempt in tobiko.retry(timeout=300, interval=10):
        # dynamically obtain the status of the VMs
        vms_detailed_info = multi_ip_test_fixture.vms_detailed_info

        vm_status_list = [
            vm.get('status') for vm in vms_detailed_info if vm is not None]
        if 'BUILD' not in vm_status_list:
            LOG.debug("All VMs reached a final status")
            break
        if attempt.is_last:
            LOG.warn("Still some VMs in status BUILD - the test continues...")
            break

    return vms_detailed_info


def check_no_duplicate_ips(vms_detailed_info, ports_before_stack_creation):
    test_case = tobiko.get_test_case()
    # check VM IP addresses are different
    ip4_list = []
    ip6_list = []
    for vm in vms_detailed_info:
        addresses = vm.get('addresses', {}) if vm is not None else {}
        # try to obtain the port associated to a VM from neutron if the VM
        # exists but vms_detailed_info does not show the port
        if not addresses and vm is not None:
            ports = neutron.list_ports(device_id=vm['id'],
                                       device_owner="compute:nova")
            test_case.assertLess(len(ports), 2)
            for port in ports:
                addresses[port['network_id']] = port['fixed_ips']

        for addresses_per_network in addresses.values():
            test_case.assertEqual(len(addresses_per_network), 2)
            for subnet_addr in addresses_per_network:
                # the subnet_addr dict is different depending on how it was
                # obtained: from vms_detailed_info or from neutron.list_ports
                subnet_ip = (subnet_addr.get('addr') or
                             subnet_addr.get('ip_address'))
                if netaddr.valid_ipv4(subnet_ip):
                    ip4_list.append(subnet_ip)
                elif netaddr.valid_ipv6(subnet_ip):
                    ip6_list.append(subnet_ip)

    ip4_set = set(ip4_list)  # this removes duplicate values
    LOG.debug("list of IPv4s from the MultiIPVM group: %r", ip4_list)
    test_case.assertEqual(len(ip4_list), len(ip4_set))

    ip6_set = set(ip6_list)  # this removes duplicate values
    LOG.debug("list of IPv6s from the MultiIPVM group: %r", ip6_list)
    test_case.assertEqual(len(ip6_list), len(ip6_set))

    LOG.debug("list of IPv4 and list of IPv6 addresses "
              "should have the same length")
    test_case.assertEqual(len(ip6_list), len(ip4_list))

    ports_after_reboot = neutron.list_ports(device_owner="compute:nova")
    ports_after_reboot_full = neutron.list_ports()
    LOG.debug(
        "list of compute:nova ports obtained at the begining of this test: %r",
        ports_before_stack_creation)
    LOG.debug(
        "list of compute:nova ports obtained at the end of this test: %r",
        ports_after_reboot)
    LOG.debug("full list of ports obtained at the end of this test: %r",
              ports_after_reboot_full)
    test_case.assertEqual(len(ip6_list), len(ports_after_reboot) - len(
        ports_before_stack_creation))


def reboot_controller_galera_main_vip():
    return disrupt_controller_galera_main_vip(
        disrupt_method=sh.soft_reset_method)


def reset_controller_main_vip():
    disrupt_controller_main_vip(disrupt_method=sh.hard_reset_method)


def reset_controllers_non_main_vip():
    disrupt_controller_main_vip(disrupt_method=sh.hard_reset_method,
                                inverse=True)


def crash_controller_main_vip():
    disrupt_controller_main_vip(disrupt_method=sh.crash_method)


def crash_controllers_non_main_vip():
    disrupt_controller_main_vip(disrupt_method=sh.crash_method,
                                inverse=True)


def network_disrupt_controller_main_vip():
    disrupt_controller_main_vip(disrupt_method=get_network_disruption(
                                is_ipv6addr_main_vip()))
    LOG.info('waiting 60s to avoid race conditions...')
    time.sleep(60.0)


def network_undisrupt_controller_main_vip():
    disrupt_controller_main_vip(disrupt_method=get_undisrupt_network(
                                is_ipv6addr_main_vip()))


def network_disrupt_controllers_non_main_vip():
    disrupt_controller_main_vip(disrupt_method=get_network_disruption(
                                is_ipv6addr_main_vip()),
                                inverse=True)


def network_undisrupt_controllers_non_main_vip():
    disrupt_controller_main_vip(disrupt_method=get_undisrupt_network(
                                is_ipv6addr_main_vip()),
                                inverse=True)


def is_network_disruption(disrupt_method):
    if re.search("ip6?tables", disrupt_method) \
                 and not re.search("restore", disrupt_method):
        return True
    else:
        return False


def get_network_disruption(isIpv6=False):
    """
    Param: isIpv6: boolean
    Return network_disruption(for ipv4) or network_disruption_ipv6 (which are
    iptables or ip6tables rules, correspondingly), according to the
    isIpv6 flag.
    """
    if isIpv6:
        return network_disruption_ipv6
    else:
        return network_disruption


def get_undisrupt_network(isIpv6=False):
    """
    Return undisrupt_network(for ipv4) or undisrupt_network_ipv6 method,
    according to the isIpv6 flag.
    """
    if isIpv6:
        return undisrupt_network_ipv6
    else:
        return undisrupt_network


def reset_all_compute_nodes(hard_reset=False, sequentially=False):

    # reboot all computes and wait for ssh Up on them
    if hard_reset:
        reset_method = sh.hard_reset_method
    else:
        reset_method = sh.soft_reset_method

    nodes = topology.list_openstack_nodes(group='compute')
    compute_reboot_operation_list = []
    for compute in nodes:
        # using ssh_client.connect we use a fire and forget reboot method
        # if sequentially, then wait and check uptime has changed
        # else, do not wait (uptime will be checked later)
        reboot_operation = sh.reboot_host(ssh_client=compute.ssh_client,
                                          wait=sequentially,
                                          method=reset_method)
        compute_reboot_operation_list.append(reboot_operation)
        LOG.info('reboot exec:  {} on server: {}'.format(reset_method,
                                                         compute.name))

    if not sequentially:
        for reboot_operation in compute_reboot_operation_list:
            # checking uptime on each compute - it should have been updated
            # after the reboot is done
            reboot_operation.wait_for_operation()
            LOG.info(f'{reboot_operation.hostname} is up')


def reset_ovndb_pcs_master_resource():
    """restart ovndb pacemaker resource
    this method only restart the resource running on the controller with is
    acting as Master"""
    node = pacemaker.get_overcloud_nodes_running_pcs_resource(
        resource_type='(ocf::ovn:ovndb-servers):', resource_state='Master')[0]
    ovn_db_pcs_master_resource_restart = (ovn_db_pcs_resource_restart + ' ' +
                                          node)
    disrupt_node(node, disrupt_method=ovn_db_pcs_master_resource_restart)


def reset_ovndb_pcs_resource():
    """restart ovndb pacemaker resource
    this method restart the whole resource, i.e. on all the controller nodes"""
    node = pacemaker.get_overcloud_nodes_running_pcs_resource(
        resource_type='(ocf::ovn:ovndb-servers):', resource_state='Master')[0]
    disrupt_node(node, disrupt_method=ovn_db_pcs_resource_restart)


def reset_ovndb_master_container():
    """get and restart the ovndb master container
    use of partial name :  resource: ovn-dbs-bundle-0 =>
    container: ovn-dbs-bundle-podman-0 or ovn-dbs-bundle-docker-0"""
    node = pacemaker.get_overcloud_nodes_running_pcs_resource(
        resource_type='(ocf::ovn:ovndb-servers):', resource_state='Master')[0]
    resource = pacemaker.get_overcloud_resource(
        resource_type='(ocf::ovn:ovndb-servers):', resource_state='Master')
    resource = resource[0].rsplit('-', 1)[0]
    containers.action_on_container('restart',
                                   partial_container_name=resource,
                                   container_host=node)


def restart_service_on_all_nodes(service):
    """restart the ovn bgp agent or the frr service from all the nodes where it
    is running and check the cloud is healthy after they are started again"""
    node_names = tripleo.get_overcloud_nodes_running_service(service)
    nodes = topology.list_openstack_nodes(hostnames=node_names)
    for node in nodes:
        sh.stop_systemd_units(service, ssh_client=node.ssh_client)
    for node in nodes:
        sh.start_systemd_units(service, ssh_client=node.ssh_client)


def kill_rabbitmq_service():
    """kill a rabbit process on a random controller,
    check in pacemaker it is down"""
    if tripleo_topology.is_composable_roles_env():
        nodes = topology.list_openstack_nodes(group='messaging')
    else:
        nodes = topology.list_openstack_nodes(group='controller')
    node = random.choice(nodes)
    sh.execute(kill_rabbit, ssh_client=node.ssh_client)
    LOG.info('kill rabbit: {} on server: {}'.format(kill_rabbit,
             node.name))
    retry = tobiko.retry(timeout=30, interval=5)
    for _ in retry:
        if not(pacemaker.PacemakerResourcesStatus().
               rabbitmq_resource_healthy()):
            return


def kill_all_galera_services():
    """kill all galera processes,
    check in pacemaker it is down"""
    if tripleo_topology.is_composable_roles_env():
        nodes = topology.list_openstack_nodes(group='database')
    else:
        nodes = topology.list_openstack_nodes(group='controller')
    for node in nodes:
        if topology.verify_osp_version('17.0', lower=True):
            sh.execute(kill_mysqld, ssh_client=node.ssh_client)
            LOG.info('kill galera: {} on server: {}'.format(kill_mysqld,
                                                            node.name))
        else:
            sh.execute(kill_mariadbd, ssh_client=node.ssh_client)
            LOG.info('kill galera: {} on server: {}'.format(kill_mariadbd,
                                                            node.name))
    retry = tobiko.retry(timeout=30, interval=5)
    for _ in retry:
        if not(pacemaker.PacemakerResourcesStatus().
               galera_resource_healthy()):
            return


def remove_all_grastate_galera():
    """shut down galera properly,
    remove all grastate"""
    if tripleo_topology.is_composable_roles_env():
        nodes = topology.list_openstack_nodes(group='database')
    else:
        nodes = topology.list_openstack_nodes(group='controller')
    LOG.info('shut down {} on all servers: {}'.format(
        pacemaker.GALERA_RESOURCE, nodes))
    if f"resource '{pacemaker.GALERA_RESOURCE}' is not running on any node" \
            not in pacemaker.run_pcs_resource_operation(
                pacemaker.GALERA_RESOURCE,
                pacemaker.DISABLE,
                nodes[0].ssh_client):
        raise PcsDisableException()
    for node in nodes:
        sh.execute(remove_grastate, ssh_client=node.ssh_client)

    LOG.info('enable back {} on all servers: {}'.format(
        pacemaker.GALERA_RESOURCE, nodes))
    if topology.verify_osp_version('17.0', lower=True):
        promoted = "master"
    else:
        promoted = "promoted"
    if f"resource '{pacemaker.GALERA_RESOURCE}' is {promoted} on node" not in \
            pacemaker.run_pcs_resource_operation(pacemaker.GALERA_RESOURCE,
                                                 pacemaker.ENABLE,
                                                 nodes[0].ssh_client,
                                                 operation_wait=90):
        raise PcsEnableException()


def remove_one_grastate_galera():
    """shut down galera properly,
    delete /var/lib/mysql/grastate.dat in a random node,
    check that bootstrap is done from a node with grastate"""
    if tripleo_topology.is_composable_roles_env():
        nodes = topology.list_openstack_nodes(group='database')
    else:
        nodes = topology.list_openstack_nodes(group='controller')
    node = random.choice(nodes)

    pcs_haproxy = pacemaker.HAPROXY_RESOURCE
    pcs_galera = pacemaker.GALERA_RESOURCE

    if not CONF.tobiko.rhosp.has_external_load_balancer:
        LOG.info(f'disable {pcs_haproxy}')
        if f"resource '{pcs_haproxy}' is not running on any node" not in \
                pacemaker.run_pcs_resource_operation(
                    pcs_haproxy,
                    pacemaker.DISABLE,
                    node.ssh_client,
                    operation_wait=30):
            raise PcsDisableException()
    else:
        LOG.debug(f'With Ext LB setups, {pcs_haproxy} is not deployed')

    LOG.info('shut down {} on all servers: {}'.format(pcs_galera, nodes))
    if f"resource '{pcs_galera}' is not running on any node" not in \
            pacemaker.run_pcs_resource_operation(
                pcs_galera,
                pacemaker.DISABLE,
                node.ssh_client):
        raise PcsDisableException()
    LOG.info('remove grastate: {} on server: {}'.format(remove_grastate,
                                                        node.name))
    sh.execute(remove_grastate, ssh_client=node.ssh_client)

    LOG.info('enable back {} on all servers: {}'.format(pcs_galera, nodes))
    if topology.verify_osp_version('17.0', lower=True):
        promoted = "master"
    else:
        promoted = "promoted"
    if f"resource '{pcs_galera}' is {promoted} on node" not in \
            pacemaker.run_pcs_resource_operation(
                pcs_galera, pacemaker.ENABLE, node.ssh_client,
                operation_wait=90):
        raise PcsEnableException()

    if not CONF.tobiko.rhosp.has_external_load_balancer:
        LOG.info(f'enable {pcs_haproxy}')
        if f"resource '{pcs_haproxy}' is running on node" not in \
                pacemaker.run_pcs_resource_operation(pcs_haproxy,
                                                     pacemaker.ENABLE,
                                                     node.ssh_client):
            raise PcsEnableException()
    else:
        LOG.debug(f'With Ext LB setups, {pcs_haproxy} is not deployed')

    # gcomm:// without args means that bootstrap is done from this node
    bootstrap = sh.execute(check_bootstrap, ssh_client=node.ssh_client).stdout
    if re.search('wsrep-cluster-address=gcomm://', bootstrap) is None:
        raise GaleraBoostrapException()
    lastDate = re.findall(r"\w{,3}\s*\w{,3}\s*\d{,2}\s*\d{,2}:\d{,2}:\d{,2}\s*"
                          r"\d{4}", bootstrap)[-1]
    return node, lastDate


def request_galera_sst():
    """remove_one_grastate_galera,
    check that sst is requested by a node with grastate"""
    node, date = remove_one_grastate_galera()
    bootstrapDate = datetime.strptime(date, '%a %b %d %H:%M:%S %Y')
    retry = tobiko.retry(timeout=30, interval=5)
    for _ in retry:
        sst_req = sh.execute(galera_sst_request,
                             ssh_client=node.ssh_client).stdout
        if sst_req:
            break
    sstDate = datetime.strptime(re.findall
                                (r"\d{4}-\d{,2}-\d{,2}\s*\d{,2}:\d{,2}:\d{,2}",
                                 sst_req)[-1], '%Y-%m-%d %H:%M:%S')
    if bootstrapDate > sstDate:
        raise TimestampException


def evac_failover_compute(compute_host, failover_type=sh.hard_reset_method):
    """disrupt a compute, to trigger it's instance-HA evacuation
    failover_type=hard_reset_method etc.."""
    if failover_type in (sh.hard_reset_method, sh.soft_reset_method):
        reboot_node(compute_host, reboot_method=failover_type)
    else:
        disrupt_node(compute_host, disrupt_method=failover_type)


def check_iha_evacuation(failover_type=None, vm_type=None):
    """check vms on compute host,disrupt compute host,
    check all vms evacuated and pingable"""
    for iteration in range(2):
        LOG.info(f'Begin IHA tests iteration {iteration}')
        LOG.info('create 2 vms')
        tests.test_servers_creation(number_of_servers=2)
        compute_host = nova.get_random_compute_with_vms_name()
        vms_starting_state_df = nova.get_compute_vms_df(compute_host)
        if vm_type == 'shutoff':
            nova.stop_all_instances()
        if vm_type == 'evac_image_vm':
            evac_vm_stack = tests.test_evacuable_server_creation()
            evac_vm_id = nova.get_stack_server_id(evac_vm_stack)
            org_nova_evac_df = nova.vm_df(evac_vm_id, nova.get_vms_table())
        if not vm_type == 'shutoff':
            nova.check_df_vms_ping(vms_starting_state_df)
        LOG.info(f'perform a failover on {compute_host}')
        evac_failover_compute(compute_host, failover_type=failover_type)
        test_cloud_recovery.overcloud_health_checks(passive_checks_only=True)
        if vm_type == 'evac_image_vm':
            nova.check_vm_evacuations(vms_df_old=org_nova_evac_df,
                                      compute_host=compute_host,
                                      timeout=600,
                                      check_no_evacuation=True)
            # delete evacuable tagged image because it prevents
            # non tagged evacuations if exists
            delete_evacuable_tagged_image()
            new_nova_evac_df = nova.vm_df(evac_vm_id, nova.get_vms_table())
            nova.check_vm_evacuations(org_nova_evac_df, new_nova_evac_df)
        else:
            nova.check_vm_evacuations(vms_df_old=vms_starting_state_df,
                                      compute_host=compute_host,
                                      timeout=600)
        LOG.info('check evac is Done')
        if not vm_type == 'shutoff':
            nova.check_df_vms_ping(vms_starting_state_df)


def check_iha_evacuation_evac_image_vm():
    check_iha_evacuation(failover_type=sh.hard_reset_method,
                         vm_type='evac_image_vm')


def check_iha_evacuation_hard_reset():
    check_iha_evacuation(failover_type=sh.hard_reset_method)


def check_iha_evacuation_network_disruption():
    check_iha_evacuation(failover_type=get_network_disruption(
                         is_ipv6addr_main_vip()))


def check_iha_evacuation_hard_reset_shutoff_instance():
    check_iha_evacuation(failover_type=sh.hard_reset_method, vm_type='shutoff')


def test_controllers_shutdown():
    test_case = tobiko.get_test_case()

    all_nodes = topology.list_openstack_nodes(group='controller')
    if len(all_nodes) < 3:
        tobiko.skip_test('It requires at least three controller nodes')

    all_node_names = [node.name for node in all_nodes]
    LOG.info("Ensure all controller nodes are running: "
             f"{all_node_names}")
    for node in all_nodes:
        node.power_on_node()
    topology.assert_reachable_nodes(all_nodes)

    LOG.debug('Check VM is running while all controllers nodes are on')
    nova_server = tobiko.setup_fixture(stacks.CirrosServerStackFixture)
    nova_server_ip = nova_server.ip_address
    ping.assert_reachable_hosts([nova_server_ip])

    quorum_level = math.ceil(0.5 * len(all_nodes))
    assert quorum_level >= len(all_nodes) - quorum_level
    nodes = random.sample(all_nodes, quorum_level)
    node_names = [node.name for node in nodes]
    LOG.info(f"Power off {quorum_level} random controller nodes: "
             f"{node_names}")
    for node in nodes:
        node.power_off_node()
        test_case.addCleanup(node.power_on_node)
    topology.assert_unreachable_nodes(nodes, retry_count=1)
    topology.assert_reachable_nodes(node
                                    for node in all_nodes
                                    if node not in nodes)

    LOG.debug('Check whenever VM is still running while some "'
              '"controllers nodes are off')
    reachable, unreachable = ping.ping_hosts([nova_server_ip],
                                             count=1)
    if reachable:
        LOG.debug(f"VM ips are reachable: {reachable}")
    if unreachable:
        LOG.debug(f"VM is are unreachable: {unreachable}")
    # TODO what do we expect here: VM reachable or unreachable?

    random.shuffle(nodes)
    LOG.info(f"Power on controller nodes: {node_names}")
    for node in nodes:
        node.power_on_node()

    LOG.debug("Check all controller nodes are running again: "
              f"{all_node_names}")
    topology.assert_reachable_nodes(all_nodes, retry_timeout=600.)

    LOG.debug('Check VM is running while all controllers nodes are on')
    ping.assert_reachable_hosts([nova_server_ip])