add IHA tests
Change-Id: I8d2740e877c607c3b225dd3475b4abc6f3420378
@@ -10,6 +10,7 @@ from tobiko.tripleo import topology as tripleo_topology
from tobiko.openstack import keystone
from tobiko.tripleo import pacemaker
from tobiko.tripleo import containers
from tobiko.tripleo import nova
from oslo_log import log
from tobiko.tests.faults.ha import test_cloud_recovery

@@ -38,6 +39,7 @@ ovn_db_pcs_resource_restart = """sudo pcs resource restart ovn-dbs-bundle"""


def get_node(node_name):
    node_name = node_name.split('.')[0]
    return [node for node in topology.list_openstack_nodes() if
            node.name == node_name][0]

@@ -65,7 +67,10 @@ def disrupt_node(node_name, disrupt_method=hard_reset_method):
    node.ssh_client.connect().exec_command(disrupt_method)
    LOG.info('disrupt exec: {} on server: {}'.format(disrupt_method,
                                                     node.name))
    check_overcloud_node_responsive(node)


def check_overcloud_node_responsive(node):
    node_checked = sh.execute("hostname",
                              ssh_client=node.ssh_client,
                              expect_exit_status=None).stdout
@@ -117,10 +122,10 @@ def disrupt_all_controller_nodes(disrupt_method=hard_reset_method,
                                                         controller.name))
        tobiko.cleanup_fixture(controller.ssh_client)
        if sequentially:
            test_cloud_recovery.check_overcloud_node_responsive(controller)
            check_overcloud_node_responsive(controller)
    if not sequentially:
        for controller in topology.list_openstack_nodes(group='controller'):
            test_cloud_recovery.check_overcloud_node_responsive(controller)
            check_overcloud_node_responsive(controller)


def get_main_vip():
@@ -224,3 +229,58 @@ def reset_ovndb_master_container():
    containers.action_on_container('restart',
                                   partial_container_name=resource,
                                   container_host=node)


def evac_failover_compute(compute_host, failover_type=hard_reset_method):
    """disrupt a compute node to trigger its instance-HA evacuation,
    failover_type=hard_reset_method etc."""
    reset_node(compute_host, disrupt_method=failover_type)


def check_iha_evacuation(failover_type=None, vm_type=None):
    """check vms on compute host, disrupt compute host,
    check all vms evacuated and pingable"""
    for iteration in range(2):
        LOG.info(f'Begin IHA tests iteration {iteration}')
        LOG.info('create 4 vms')
        nova.create_multiple_unique_vms(n_vms=4)
        compute_host = nova.get_random_compute_with_vms_name()
        vms_starting_state_df = nova.get_compute_vms_df(compute_host)
        if vm_type == 'shutoff':
            nova.stop_all_instances()
        if vm_type == 'evac_image_vm':
            evac_vm_stack = nova.random_vm_create_evacuable_image_tag()
            evac_vm_id = nova.get_stack_server_id(evac_vm_stack)
            org_nova_evac_df = nova.vm_df(evac_vm_id, nova.get_vms_table())
        nova.check_df_vms_ping(vms_starting_state_df)
        LOG.info(f'perform a failover on {compute_host}')
        evac_failover_compute(compute_host, failover_type=failover_type)
        test_cloud_recovery.overcloud_health_checks(passive_checks_only=True)
        vms_new_state_df = nova.get_compute_vms_df(compute_host)
        if vm_type == 'evac_image_vm':
            nova.check_vm_evacuations(vms_df_old=org_nova_evac_df,
                                      vms_df_new=vms_new_state_df,
                                      check_no_evacuation=True)
            new_nova_evac_df = nova.vm_df(evac_vm_id, nova.get_vms_table())
            nova.check_vm_evacuations(org_nova_evac_df, new_nova_evac_df)
        LOG.info('check evacuation is done')
        nova.check_vm_evacuations(vms_df_old=vms_starting_state_df,
                                  vms_df_new=vms_new_state_df)
        nova.check_df_vms_ping(vms_starting_state_df)


def check_iha_evacuation_evac_image_vm():
    check_iha_evacuation(failover_type=hard_reset_method,
                         vm_type='evac_image_vm')


def check_iha_evacuation_hard_reset():
    check_iha_evacuation(failover_type=hard_reset_method)


def check_iha_evacuation_network_disruption():
    check_iha_evacuation(failover_type=network_disruption)


def check_iha_evacuation_hard_reset_shutoff_instance():
    check_iha_evacuation(failover_type=hard_reset_method, vm_type='shutoff')

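The check functions above delegate the actual evacuation verification to nova helpers added later in this commit (nova.get_compute_vms_df, nova.check_vm_evacuations): snapshot which hypervisor hosts each VM before the disruption, then compare with a snapshot taken after it. A minimal standalone sketch of that comparison, using pandas and hypothetical data (not part of the commit):

# sketch only: compare per-VM host placement before and after a disruption
import pandas

old_df = pandas.DataFrame(
    [('compute-0', 'vm-1', 'ACTIVE'), ('compute-0', 'vm-2', 'ACTIVE')],
    columns=['vm_host', 'vm_id', 'vm_state'])
new_df = pandas.DataFrame(
    [('compute-1', 'vm-1', 'ACTIVE'), ('compute-1', 'vm-2', 'ACTIVE')],
    columns=['vm_host', 'vm_id', 'vm_state'])

not_evacuated = [
    vm_id for vm_id in old_df.vm_id.to_list()
    if old_df.query(f"vm_id == '{vm_id}'")['vm_host'].to_string(index=False)
    == new_df.query(f"vm_id == '{vm_id}'")['vm_host'].to_string(index=False)]
assert not not_evacuated, f'vms not evacuated: {not_evacuated}'
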
@@ -1,11 +1,6 @@
from __future__ import absolute_import

import random
from oslo_log import log

import testtools
from tobiko.shell import ping
from tobiko.shell import sh
from tobiko.tests.faults.ha import cloud_disruptions
from tobiko.tripleo import pacemaker
from tobiko.tripleo import processes
@@ -13,11 +8,6 @@ from tobiko.tripleo import containers
from tobiko.tripleo import nova
from tobiko.tripleo import neutron
from tobiko.tripleo import undercloud
from tobiko.openstack import stacks
import tobiko


LOG = log.getLogger(__name__)


def overcloud_health_checks(passive_checks_only=False):
@@ -29,37 +19,15 @@ def overcloud_health_checks(passive_checks_only=False):
    if not passive_checks_only:
        # create a uniq stack
        check_vm_create()
    else:
        # verify VM status is updated after reboot
        nova.wait_for_all_instances_status('SHUTOFF')
        nova.start_all_instances()
        nova.start_all_instances()
    containers.list_node_containers.cache_clear()
    containers.assert_all_tripleo_containers_running()
    containers.assert_equal_containers_state()


# check vm create with ssh and ping checks
def check_vm_create(stack_name='stack{}'.format(random.randint(0, 1000000))):
    """stack_name: unique stack name,
    so that each time a new vm is created"""
    # create a vm
    stack = stacks.CirrosServerStackFixture(
        stack_name=stack_name)
    tobiko.reset_fixture(stack)
    stack.wait_for_create_complete()
    # Test SSH connectivity to floating IP address
    sh.get_hostname(ssh_client=stack.ssh_client)

    # Test ICMP connectivity to floating IP address
    ping.ping_until_received(
        stack.floating_ip_address).assert_replied()


def check_overcloud_node_responsive(node):
    """wait until we get response for hostname command"""
    hostname_check = sh.execute("hostname", ssh_client=node.ssh_client,
                                expect_exit_status=None).stdout
    LOG.info('{} is up '.format(hostname_check))


def check_vm_create():
    nova.random_vm_create()


# check cluster failed statuses
@@ -73,7 +41,7 @@ def check_overcloud_processes_health():


@undercloud.skip_if_missing_undercloud
class RebootTripleoNodesTest(testtools.TestCase):
class DisruptTripleoNodesTest(testtools.TestCase):

    """ HA Tests: run health check -> disruptive action -> health check
    disruptive_action: a function that runs some
@@ -91,14 +59,11 @@ class RebootTripleoNodesTest(testtools.TestCase):
        cloud_disruptions.reset_all_controller_nodes()
        overcloud_health_checks()

    def test_sequentially_hard_reboot_controllers_recovery(self):
        overcloud_health_checks()
        cloud_disruptions.reset_all_controller_nodes_sequentially()
        overcloud_health_checks()

    def test_reboot_computes_recovery(self):
        overcloud_health_checks()
        cloud_disruptions.reset_all_compute_nodes(hard_reset=True)
        # verify VM status is updated after reboot
        nova.wait_for_all_instances_status('SHUTOFF')
        overcloud_health_checks(passive_checks_only=True)

    def test_reboot_controller_main_vip(self):
@@ -132,6 +97,26 @@ class RebootTripleoNodesTest(testtools.TestCase):
        overcloud_health_checks()
        cloud_disruptions.reset_ovndb_master_container()
        overcloud_health_checks()

    @pacemaker.skip_if_instanceha_not_deployed
    def test_instanceha_evacuation_hard_reset(self):
        overcloud_health_checks()
        cloud_disruptions.check_iha_evacuation_hard_reset()

    @pacemaker.skip_if_instanceha_not_deployed
    def test_instanceha_evacuation_network_disruption(self):
        overcloud_health_checks()
        cloud_disruptions.check_iha_evacuation_network_disruption()

    def test_instanceha_evacuation_hard_reset_shutoff_instance(self):
        overcloud_health_checks()
        cloud_disruptions.check_iha_evacuation_hard_reset_shutoff_instance()

    def test_check_instanceha_evacuation_evac_image_vm(self):
        overcloud_health_checks()
        cloud_disruptions.check_iha_evacuation_evac_image_vm()


# [..]
# more tests to follow
# run health checks

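For reference, one hypothetical way to run a single one of the new IHA tests directly with Python's unittest loader (illustration only, not part of the commit; assumes tobiko is installed and a TripleO overcloud with instance HA configured is reachable):

# illustration only: load and run one of the new tests by dotted name
import unittest

suite = unittest.defaultTestLoader.loadTestsFromName(
    'tobiko.tests.faults.ha.test_cloud_recovery.'
    'DisruptTripleoNodesTest.test_instanceha_evacuation_hard_reset')
unittest.TextTestRunner(verbosity=2).run(suite)
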
@@ -1,11 +1,17 @@
from __future__ import absolute_import

import random
import time

from oslo_log import log
import pandas

import tobiko
from tobiko.shell import ping
from tobiko.openstack import nova
from tobiko.shell import sh
from tobiko.openstack import stacks


LOG = log.getLogger(__name__)

@@ -54,6 +60,21 @@ def start_all_instances():
            tobiko.fail(instance_info)


def stop_all_instances():
    """try to stop all running overcloud instances"""
    for instance in nova.list_servers():
        stopped_instance = nova.shutoff_server(instance)
        time.sleep(3)
        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
            nova_instance=stopped_instance.name,
            state=stopped_instance.status,
            host=stopped_instance._info[  # pylint: disable=W0212
                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
        LOG.info(instance_info)
        if stopped_instance.status != 'SHUTOFF':
            tobiko.fail(instance_info)


def wait_for_all_instances_status(status, timeout=None):
    """wait for all instances for a certain status or raise an exception"""
    for instance in nova.list_servers():
@@ -65,3 +86,150 @@ def wait_for_all_instances_status(status, timeout=None):
            host=instance._info[  # pylint: disable=W0212
                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
        LOG.info(instance_info)


def get_vms_table():
    """populate a dataframe with vm host,id,status"""
    vms_data = [(vm._info[  # pylint: disable=W0212
        'OS-EXT-SRV-ATTR:hypervisor_hostname'], vm.id,
        vm.status) for vm in nova.list_servers()]
    vms_df = pandas.DataFrame(vms_data, columns=['vm_host', 'vm_id',
                                                 'vm_state'])
    return vms_df


def list_computes():
    """list compute host names"""
    return [compute.hypervisor_hostname for compute in nova.list_hypervisors()]


def get_compute_vms_df(compute_host):
    """input: compute hostname (can be short)
    output: dataframe with vms of that host"""
    return get_vms_table().query(f"vm_host=='{compute_host}'")


def get_random_compute_with_vms_name():
    """get the name of a random compute node holding vm/s"""
    for compute in list_computes():
        if not get_compute_vms_df(compute).empty:
            return compute

def vm_info(vm_id, vms_df):
    """input: vm and a vms df
    output: the vm's row as a string"""
    return vms_df.query(f"vm_id == '{vm_id}'").to_string()


def vm_df(vm_id, vms_df):
    """input: vm and a vms df
    output: the vm's row as a dataframe"""
    return vms_df.query(f"vm_id == '{vm_id}'")


def vm_floating_ip(vm_id):
    """input: vm_id
    output: its floating ip"""

    vm = nova.get_server(vm_id)
    floating_ip = nova.list_server_ip_addresses(
        vm, address_type='floating').first
    return floating_ip


def check_ping_vm_fip(fip):
    ping.ping_until_received(fip).assert_replied()


def check_df_vms_ping(df):
    """input: dataframe with vms_ids
    try to ping all vms in df"""
    for vm_id in df.vm_id.to_list():
        check_ping_vm_fip(vm_floating_ip(vm_id))


def vm_location(vm_id, vms_df):
    """input: vm and a vms df
    output: host string"""
    return vms_df.query(f"vm_id == '{vm_id}'")['vm_host'].to_string(
        index=False)

def check_vm_evacuations(vms_df_old=None, vms_df_new=None, timeout=120,
                         interval=2, check_no_evacuation=False):
    """check evacuation of vms
    input: old and new vms_state_tables dfs"""
    failures = []
    start = time.time()

    while time.time() - start < timeout:
        failures = []
        for vm_id in vms_df_old.vm_id.to_list():
            old_vm_host = vm_location(vm_id, vms_df_old)
            new_vm_host = vm_location(vm_id, vms_df_new)

            if check_no_evacuation:
                cond = bool(old_vm_host != new_vm_host)
            else:
                cond = bool(old_vm_host == new_vm_host)

            if cond:
                failures.append(
                    'failed vm evacuations: {}\n\n'.format(vm_info(vm_id,
                                                           vms_df_old)))
        if failures:
            LOG.info('Failed nova evacuation:\n {}'.format(failures))
            LOG.info('Not all nova vms evacuated ..')
            LOG.info('Retrying, remaining time: {}'
                     .format(timeout - (time.time() - start)))
            time.sleep(interval)
        else:
            LOG.info(vms_df_old.to_string())
            LOG.info('All vms were evacuated!')
            return
    # exhausted all retries
    if failures:
        tobiko.fail(
            'failed vm evacuations:\n{!s}', '\n'.join(failures))

# check vm create with ssh and ping checks
def random_vm_create(stack_name='stack{}'.format(
                     random.randint(0, 1000000)),
                     stack_template=stacks.CirrosServerStackFixture):
    """stack_name: unique stack name,
    so that a new vm is created each time"""
    # create a vm
    LOG.info(f'creating vm - {stack_name}')
    stack = stack_template(
        stack_name=stack_name)
    tobiko.reset_fixture(stack)
    stack.wait_for_create_complete()
    tobiko.cleanup_fixture(stack.ssh_client)
    # Test SSH connectivity to floating IP address
    sh.get_hostname(ssh_client=stack.ssh_client)

    # Test ICMP connectivity to floating IP address
    ping.ping_until_received(
        stack.floating_ip_address).assert_replied()
    return stack


def random_vm_create_evacuable_image_tag():
    return random_vm_create(stack_template=stacks.EvacuableServerStackFixture)


def random_vm_create_shutoff_state():
    return nova.shutoff_server(random_vm_create().server_details)


def get_stack_server_id(stack):
    return stack.server_details.id


def create_multiple_unique_vms(n_vms=2):
    """create n_vms(int)"""
    for _ in range(n_vms):
        stack_name = 'stack{}'.format(random.randint(0, 1000000))
        random_vm_create(stack_name)

@@ -286,3 +286,17 @@ def get_overcloud_resource(resource_type=None,
    pcs_df_query_resource_type = pcs_df.query(
        'resource_type=="{}"'.format(resource_type))
    return pcs_df_query_resource_type['resource'].unique().tolist()


def instanceha_deployed():
    """check IHA deployment
    checks for existence of the nova-evacuate resource"""
    if overcloud.has_overcloud():
        return get_overcloud_nodes_running_pcs_resource(
            resource='nova-evacuate')
    else:
        return False


skip_if_instanceha_not_deployed = tobiko.skip_unless(
    'instanceha not deployed', instanceha_deployed())

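The skip fixture above evaluates instanceha_deployed() once at module import time; conceptually it behaves like unittest's skipUnless decorator. A rough, illustration-only equivalent (not part of the commit; unittest.skipUnless stands in for tobiko.skip_unless here):

# illustration only: skip IHA tests unless the nova-evacuate pacemaker
# resource was found on the overcloud
import unittest

skip_if_instanceha_not_deployed = unittest.skipUnless(
    instanceha_deployed(), 'instanceha not deployed')
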