Merge "add IHA tests Change-Id: I8d2740e877c607c3b225dd3475b4abc6f3420378"

2020-06-20 00:37:54 +00:00 · 2020-06-20 00:37:54 +00:00 · 40cb580bd9
commit 40cb580bd9
parent 50c9086349 47bbb9523d
4 changed files with 270 additions and 43 deletions
--- a/tobiko/tests/faults/ha/cloud_disruptions.py
+++ b/tobiko/tests/faults/ha/cloud_disruptions.py
@ -10,6 +10,7 @@ from tobiko.tripleo import topology as tripleo_topology
 from tobiko.openstack import keystone
 from tobiko.tripleo import pacemaker
 from tobiko.tripleo import containers
 from tobiko.tripleo import nova
 from oslo_log import log
 from tobiko.tests.faults.ha import test_cloud_recovery
@ -38,6 +39,7 @@ ovn_db_pcs_resource_restart = """sudo pcs resource restart ovn-dbs-bundle"""
 def get_node(node_name):
    """Return the first OpenStack topology node matching a host name.

    Only the part of node_name before the first '.' is compared against
    each node's ``name``. An IndexError is raised when nothing matches.
    """
    short_name = node_name.split('.')[0]
    matching_nodes = []
    for candidate in topology.list_openstack_nodes():
        if candidate.name == short_name:
            matching_nodes.append(candidate)
    return matching_nodes[0]
@ -65,7 +67,10 @@ def disrupt_node(node_name, disrupt_method=hard_reset_method):
    node.ssh_client.connect().exec_command(disrupt_method)
    LOG.info('disrupt exec: {} on server: {}'.format(disrupt_method,
                                                     node.name))
    check_overcloud_node_responsive(node)
 def check_overcloud_node_responsive(node):
    node_checked = sh.execute("hostname",
                              ssh_client=node.ssh_client,
                              expect_exit_status=None).stdout
@ -117,10 +122,10 @@ def disrupt_all_controller_nodes(disrupt_method=hard_reset_method,
                                                         controller.name))
        tobiko.cleanup_fixture(controller.ssh_client)
        if sequentially:
-            test_cloud_recovery.check_overcloud_node_responsive(controller)
+            check_overcloud_node_responsive(controller)
    if not sequentially:
        for controller in topology.list_openstack_nodes(group='controller'):
-            test_cloud_recovery.check_overcloud_node_responsive(controller)
+            check_overcloud_node_responsive(controller)
 def get_main_vip():
@ -224,3 +229,58 @@ def reset_ovndb_master_container():
    containers.action_on_container('restart',
                                   partial_container_name=resource,
                                   container_host=node)
 def evac_failover_compute(compute_host, failover_type=hard_reset_method):
    """Disrupt a compute node to trigger its instance-HA evacuation.

    compute_host: node handed to reset_node as the one to disrupt.
    failover_type: forwarded to reset_node as ``disrupt_method``;
    defaults to hard_reset_method.
    """
    disruption = failover_type
    reset_node(compute_host, disrupt_method=disruption)
 def check_iha_evacuation(failover_type=None, vm_type=None):
    """Run two rounds of the instance-HA evacuation scenario.

    Each round creates 4 vms, takes the compute host returned by
    nova.get_random_compute_with_vms_name(), disrupts it with
    failover_type, runs the passive-only overcloud health checks and
    then checks evacuation and ping for the vms that were recorded on
    that host before the disruption.

    vm_type selects an optional variation:
      'shutoff'       - all instances are stopped before the failover
      'evac_image_vm' - one extra vm is created from the evacuable
                        stack and its placement is checked separately
    """
    evac_image_case = vm_type == 'evac_image_vm'
    for iteration in range(2):
        LOG.info(f'Beign IHA tests iteration {iteration}')
        LOG.info('creatr 4 vms')
        nova.create_multiple_unique_vms(n_vms=4)
        compute_host = nova.get_random_compute_with_vms_name()
        # snapshot of the vms on the chosen host before the disruption
        vms_before_df = nova.get_compute_vms_df(compute_host)
        if vm_type == 'shutoff':
            nova.stop_all_instances()
        if evac_image_case:
            evac_stack = nova.random_vm_create_evacuable_image_tag()
            evac_server_id = nova.get_stack_server_id(evac_stack)
            evac_before_df = nova.vm_df(evac_server_id, nova.get_vms_table())
        nova.check_df_vms_ping(vms_before_df)
        LOG.info(f'perform a failover on {compute_host}')
        evac_failover_compute(compute_host, failover_type=failover_type)
        test_cloud_recovery.overcloud_health_checks(passive_checks_only=True)
        # same host queried again after the failover
        vms_after_df = nova.get_compute_vms_df(compute_host)
        if evac_image_case:
            nova.check_vm_evacuations(vms_df_old=evac_before_df,
                                      vms_df_new=vms_after_df,
                                      check_no_evacuation=True)
            evac_after_df = nova.vm_df(evac_server_id, nova.get_vms_table())
            nova.check_vm_evacuations(evac_before_df, evac_after_df)
        LOG.info('check evac is Done')
        nova.check_vm_evacuations(vms_df_old=vms_before_df,
                                  vms_df_new=vms_after_df)
        nova.check_df_vms_ping(vms_before_df)
 def check_iha_evacuation_evac_image_vm():
    """Run check_iha_evacuation with failover_type=hard_reset_method
    and vm_type='evac_image_vm'."""
    check_iha_evacuation(failover_type=hard_reset_method,
                         vm_type='evac_image_vm')
 def check_iha_evacuation_hard_reset():
    """Run check_iha_evacuation with failover_type=hard_reset_method
    and the default vm_type."""
    check_iha_evacuation(failover_type=hard_reset_method)
 def check_iha_evacuation_network_disruption():
    """Run check_iha_evacuation with failover_type=network_disruption
    and the default vm_type."""
    check_iha_evacuation(failover_type=network_disruption)
 def check_iha_evacuation_hard_reset_shutfoff_inatance():
    """Run check_iha_evacuation with failover_type=hard_reset_method
    and vm_type='shutoff'."""
    check_iha_evacuation(failover_type=hard_reset_method, vm_type='shutoff')
--- a/tobiko/tests/faults/ha/test_cloud_recovery.py
+++ b/tobiko/tests/faults/ha/test_cloud_recovery.py
@ -1,11 +1,6 @@
 from __future__ import absolute_import
 import random
 from oslo_log import log
 import testtools
 from tobiko.shell import ping
 from tobiko.shell import sh
 from tobiko.tests.faults.ha import cloud_disruptions
 from tobiko.tripleo import pacemaker
 from tobiko.tripleo import processes
@ -13,11 +8,6 @@ from tobiko.tripleo import containers
 from tobiko.tripleo import nova
 from tobiko.tripleo import neutron
 from tobiko.tripleo import undercloud
 from tobiko.openstack import stacks
 import tobiko
 LOG = log.getLogger(__name__)
 def overcloud_health_checks(passive_checks_only=False):
@ -29,37 +19,15 @@ def overcloud_health_checks(passive_checks_only=False):
    if not passive_checks_only:
        # create a uniq stack
        check_vm_create()
-    else:
+        nova.start_all_instances()
        # verify VM status is updated after reboot
        nova.wait_for_all_instances_status('SHUTOFF')
    nova.start_all_instances()
    containers.list_node_containers.cache_clear()
    containers.assert_all_tripleo_containers_running()
    containers.assert_equal_containers_state()
 # check vm create with ssh and ping checks
-def check_vm_create(stack_name='stack{}'.format(random.randint(0, 1000000))):
+def check_vm_create():
-    """stack_name: unique stack name ,
+    nova.random_vm_create()
    so that each time a new vm is created"""
    # create a vm
    stack = stacks.CirrosServerStackFixture(
        stack_name=stack_name)
    tobiko.reset_fixture(stack)
    stack.wait_for_create_complete()
    # Test SSH connectivity to floating IP address
    sh.get_hostname(ssh_client=stack.ssh_client)
    # Test ICMP connectivity to floating IP address
    ping.ping_until_received(
        stack.floating_ip_address).assert_replied()
 def check_overcloud_node_responsive(node):
    """Run ``hostname`` on the node over its ssh client and log the output.

    sh.execute is called with expect_exit_status=None
    (presumably so the exit status is not enforced - confirm).
    """
    result = sh.execute("hostname", ssh_client=node.ssh_client,
                        expect_exit_status=None)
    LOG.info('{} is up '.format(result.stdout))
 # check cluster failed statuses
@ -73,7 +41,7 @@ def check_overcloud_processes_health():
@undercloud.skip_if_missing_undercloud
-class RebootTripleoNodesTest(testtools.TestCase):
+class DisruptTripleoNodesTest(testtools.TestCase):
    """ HA Tests: run health check -> disruptive action -> health check
    disruptive_action: a function that runs some
@ -91,14 +59,11 @@ class RebootTripleoNodesTest(testtools.TestCase):
        cloud_disruptions.reset_all_controller_nodes()
        overcloud_health_checks()
    def test_sequentially_hard_reboot_controllers_recovery(self):
        # health check -> reset_all_controller_nodes_sequentially ->
        # health check (both health checks use the default, non-passive mode)
        overcloud_health_checks()
        cloud_disruptions.reset_all_controller_nodes_sequentially()
        overcloud_health_checks()
    def test_reboot_computes_recovery(self):
        # health check before the disruption
        overcloud_health_checks()
        cloud_disruptions.reset_all_compute_nodes(hard_reset=True)
        # verify VM status is updated after reboot
        nova.wait_for_all_instances_status('SHUTOFF')
        # after the reset only the passive checks are requested
        overcloud_health_checks(passive_checks_only=True)
    def test_reboot_controller_main_vip(self):
@ -132,6 +97,26 @@ class RebootTripleoNodesTest(testtools.TestCase):
        overcloud_health_checks()
        cloud_disruptions.reset_ovndb_master_container()
        overcloud_health_checks()
    @pacemaker.skip_if_instanceha_not_delpoyed
    def test_instanceha_evacuation_hard_reset(self):
        # health check first, then the IHA evacuation scenario driven by
        # cloud_disruptions.check_iha_evacuation_hard_reset
        overcloud_health_checks()
        cloud_disruptions.check_iha_evacuation_hard_reset()
    @pacemaker.skip_if_instanceha_not_delpoyed
    def test_instanceha_evacuation_network_disruption(self):
        # health check first, then the IHA evacuation scenario driven by
        # cloud_disruptions.check_iha_evacuation_network_disruption
        overcloud_health_checks()
        cloud_disruptions.check_iha_evacuation_network_disruption()
    @pacemaker.skip_if_instanceha_not_delpoyed
    def test_instanceha_evacuation_hard_reset_shutfoff_inatance(self):
        # Guarded like the other instance-HA evacuation tests in this class:
        # the scenario only makes sense when instance HA is deployed.
        # health check first, then the IHA evacuation scenario with
        # shut-off instances
        overcloud_health_checks()
        cloud_disruptions.check_iha_evacuation_hard_reset_shutfoff_inatance()
    @pacemaker.skip_if_instanceha_not_delpoyed
    def test_check_instanceha_evacuation_evac_image_vm(self):
        # Guarded like the other instance-HA evacuation tests in this class:
        # the scenario only makes sense when instance HA is deployed.
        # health check first, then the IHA evacuation scenario with an
        # extra vm created from the evacuable stack
        overcloud_health_checks()
        cloud_disruptions.check_iha_evacuation_evac_image_vm()
 # [..]
 # more tests to follow
 # run health checks
--- a/tobiko/tripleo/nova.py
+++ b/tobiko/tripleo/nova.py
@ -1,11 +1,17 @@
 from __future__ import absolute_import
 import random
 import time
 from oslo_log import log
 import pandas
 import tobiko
 from tobiko.shell import ping
 from tobiko.openstack import nova
 from tobiko.shell import sh
 from tobiko.openstack import stacks
 LOG = log.getLogger(__name__)
@ -54,6 +60,21 @@ def start_all_instances():
            tobiko.fail(instance_info)
 def stop_all_instances():
    """Shut off every server listed by nova.

    For each server: call nova.shutoff_server, pause 3 seconds, log the
    resulting name/status/hypervisor and call tobiko.fail if the status
    is not 'SHUTOFF'.
    """
    for instance in nova.list_servers():
        stopped_instance = nova.shutoff_server(instance)
        time.sleep(3)
        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
            nova_instance=stopped_instance.name,
            state=stopped_instance.status,
            host=stopped_instance._info[  # pylint: disable=W0212
                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
        LOG.info(instance_info)
        if stopped_instance.status != 'SHUTOFF':
            tobiko.fail(instance_info)
 def wait_for_all_instances_status(status, timeout=None):
    """wait for all instances for a certain status or raise an exception"""
    for instance in nova.list_servers():
@ -65,3 +86,150 @@ def wait_for_all_instances_status(status, timeout=None):
            host=instance._info[  # pylint: disable=W0212
                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
        LOG.info(instance_info)
 def get_vms_table():
    """Build a dataframe with one row per server listed by nova.

    Columns: 'vm_host' (the server's
    'OS-EXT-SRV-ATTR:hypervisor_hostname' attribute), 'vm_id' and
    'vm_state'.
    """
    rows = []
    for server in nova.list_servers():
        hypervisor = server._info[  # pylint: disable=W0212
            'OS-EXT-SRV-ATTR:hypervisor_hostname']
        rows.append((hypervisor, server.id, server.status))
    return pandas.DataFrame(rows, columns=['vm_host', 'vm_id', 'vm_state'])
 def list_computes():
    """Return the hypervisor_hostname of every hypervisor listed by nova."""
    hostnames = []
    for hypervisor in nova.list_hypervisors():
        hostnames.append(hypervisor.hypervisor_hostname)
    return hostnames
 def get_compute_vms_df(compute_host):
    """Return the rows of the vms table whose 'vm_host' equals compute_host.

    The table is rebuilt through get_vms_table() on every call and
    filtered with an equality query on the 'vm_host' column.
    """
    vms_table = get_vms_table()
    host_filter = "vm_host=='{}'".format(compute_host)
    return vms_table.query(host_filter)
 def get_random_compute_with_vms_name():
    """Return the first compute, in list_computes() order, holding a vm.

    Despite the name no random choice is made. None is returned when no
    compute holds any vm.
    """
    computes_with_vms = (compute for compute in list_computes()
                         if not get_compute_vms_df(compute).empty)
    return next(computes_with_vms, None)
 def vm_info(vm_id, vms_df):
    """Return the rows of vms_df whose 'vm_id' equals vm_id, rendered
    as a string with DataFrame.to_string()."""
    id_filter = "vm_id == '{}'".format(vm_id)
    selected_rows = vms_df.query(id_filter)
    return selected_rows.to_string()
 def vm_df(vm_id, vms_df):
    """Return the sub-dataframe of vms_df whose 'vm_id' equals vm_id."""
    id_filter = "vm_id == '{}'".format(vm_id)
    return vms_df.query(id_filter)
 def vm_floating_ip(vm_id):
    """Return the first floating ip address of the server with id vm_id.

    The server is looked up with nova.get_server and its addresses are
    listed with address_type='floating'.
    """
    server = nova.get_server(vm_id)
    floating_addresses = nova.list_server_ip_addresses(
        server, address_type='floating')
    return floating_addresses.first
 def check_ping_vm_fip(fip):
    """Ping fip with ping.ping_until_received and assert it replied."""
    ping_result = ping.ping_until_received(fip)
    ping_result.assert_replied()
 def check_df_vms_ping(df):
    """Ping the floating ip of every vm listed in df.

    df: dataframe with a 'vm_id' column; each id is resolved with
    vm_floating_ip and checked with check_ping_vm_fip, in order.
    """
    for server_id in df.vm_id.to_list():
        floating_ip = vm_floating_ip(server_id)
        check_ping_vm_fip(floating_ip)
 def vm_location(vm_id, vms_df):
    """Return the 'vm_host' value(s) of the vms_df rows matching vm_id.

    The matching 'vm_host' column is rendered with
    Series.to_string(index=False), so the result is a string.
    """
    id_filter = "vm_id == '{}'".format(vm_id)
    host_column = vms_df.query(id_filter)['vm_host']
    return host_column.to_string(index=False)
 def check_vm_evacuations(vms_df_old=None, vms_df_new=None, timeout=120,
                         interval=2, check_no_evacuation=False):
    """Check that every vm of vms_df_old was evacuated (or was not).

    vms_df_old / vms_df_new: dataframes with 'vm_id' and 'vm_host'
        columns holding the vm placement before and after the disruption.
    timeout: seconds to keep retrying before giving up.
    interval: seconds to sleep between two attempts.
    check_no_evacuation: when False (default) a vm whose host is the same
        in both dataframes is a failure; when True the check is inverted
        and a vm whose host differs is a failure.

    Returns as soon as an attempt finds no failure; calls tobiko.fail
    with the collected failures once timeout is exhausted.
    """
    failures = []
    start = time.time()
    while time.time() - start < timeout:
        failures = []
        for vm_id in vms_df_old.vm_id.to_list():
            old_vm_host = vm_location(vm_id, vms_df_old)
            new_vm_host = vm_location(vm_id, vms_df_new)
            same_host = bool(old_vm_host == new_vm_host)
            failed = not same_host if check_no_evacuation else same_host
            if failed:
                failures.append(
                    'failed vm evacuations: {}\n\n'.format(vm_info(vm_id,
                                                           vms_df_old)))
        # The verdict is taken only after *all* vms were compared; deciding
        # inside the loop would report success on the first passing vm and
        # never look at the remaining ones.
        if not failures:
            LOG.info(vms_df_old.to_string())
            LOG.info('All vms were evacuated!')
            return
        LOG.info('Failed nova evacuation:\n {}'.format(failures))
        LOG.info('Not all nova vms evacuated ..')
        LOG.info('Retrying , timeout at: {}'
                 .format(timeout-(time.time() - start)))
        # NOTE(review): both dataframes are fixed inputs and are not
        # refreshed here, so every retry compares the same data.
        time.sleep(interval)
    # exhausted all retries
    if failures:
        tobiko.fail(
            'failed vm evacuations:\n{!s}', '\n'.join(failures))
 # check vm create with ssh and ping checks
 def random_vm_create(stack_name=None,
                     stack_template=stacks.CirrosServerStackFixture):
    """Create a vm stack and check it answers to ssh and ping.

    stack_name: name of the stack to create. When omitted a new
        'stack<random int>' name is generated on every call, so that each
        call creates a new vm (a default computed in the signature would
        be evaluated only once and reused by every call).
    stack_template: stack fixture class instantiated with stack_name.

    Returns the created stack fixture.
    """
    if stack_name is None:
        stack_name = 'stack{}'.format(random.randint(0, 1000000))
    # create a vm
    LOG.info(f'creating vm - {stack_name}')
    stack = stack_template(
        stack_name=stack_name)
    tobiko.reset_fixture(stack)
    stack.wait_for_create_complete()
    tobiko.cleanup_fixture(stack.ssh_client)
    # Test SSH connectivity to floating IP address
    sh.get_hostname(ssh_client=stack.ssh_client)
    # Test ICMP connectivity to floating IP address
    ping.ping_until_received(
        stack.floating_ip_address).assert_replied()
    return stack
 def random_vm_create_evacuable_image_tag():
    """Create a vm through random_vm_create using
    stacks.EvacuableServerStackFixture as the stack template and return
    the resulting stack."""
    evacuable_template = stacks.EvacuableServerStackFixture
    return random_vm_create(stack_template=evacuable_template)
 def random_vm_create_shutoff_state():
    """Create a vm through random_vm_create, then shut its server off.

    Returns whatever nova.shutoff_server returns for the new server.
    """
    stack = random_vm_create()
    return nova.shutoff_server(stack.server_details)
 def get_stack_server_id(stack):
    """Return the ``id`` of the stack's ``server_details``."""
    server = stack.server_details
    return server.id
 def create_multiple_unique_vms(n_vms=2):
    """Create n_vms vms, each under a freshly generated stack name.

    n_vms: number (int) of random_vm_create calls to make; every call
    gets its own 'stack<random int>' name.
    """
    for _ in range(n_vms):
        random_vm_create('stack{}'.format(random.randint(0, 1000000)))
--- a/tobiko/tripleo/pacemaker.py
+++ b/tobiko/tripleo/pacemaker.py
@ -286,3 +286,17 @@ def get_overcloud_resource(resource_type=None,
        pcs_df_query_resource_type = pcs_df.query(
            'resource_type=="{}"'.format(resource_type))
        return pcs_df_query_resource_type['resource'].unique().tolist()
 def instanceha_delpoyed():
    """Tell whether instance HA looks deployed on the overcloud.

    Returns False when overcloud.has_overcloud() is false; otherwise
    returns the result of looking up the nodes running the
    'nova-evacuate' pcs resource.
    """
    if not overcloud.has_overcloud():
        return False
    return get_overcloud_nodes_running_pcs_resource(
        resource='nova-evacuate')
# Decorator built with tobiko.skip_unless, used to guard instance-HA tests.
# NOTE(review): instanceha_delpoyed() is called right here, i.e. when this
# module is imported, and its result (not the function itself) is what gets
# passed to tobiko.skip_unless - confirm skip_unless expects a value rather
# than a callable predicate.
skip_if_instanceha_not_delpoyed = tobiko.skip_unless(
    'instanceha not delpoyed', instanceha_delpoyed())