Merge "add IHA tests Change-Id: I8d2740e877c607c3b225dd3475b4abc6f3420378"

This commit is contained in:
Zuul 2020-06-20 00:37:54 +00:00 committed by Gerrit Code Review
commit 40cb580bd9
4 changed files with 270 additions and 43 deletions

View File

@ -10,6 +10,7 @@ from tobiko.tripleo import topology as tripleo_topology
from tobiko.openstack import keystone from tobiko.openstack import keystone
from tobiko.tripleo import pacemaker from tobiko.tripleo import pacemaker
from tobiko.tripleo import containers from tobiko.tripleo import containers
from tobiko.tripleo import nova
from oslo_log import log from oslo_log import log
from tobiko.tests.faults.ha import test_cloud_recovery from tobiko.tests.faults.ha import test_cloud_recovery
@ -38,6 +39,7 @@ ovn_db_pcs_resource_restart = """sudo pcs resource restart ovn-dbs-bundle"""
def get_node(node_name): def get_node(node_name):
node_name = node_name.split('.')[0]
return [node for node in topology.list_openstack_nodes() if return [node for node in topology.list_openstack_nodes() if
node.name == node_name][0] node.name == node_name][0]
@ -65,7 +67,10 @@ def disrupt_node(node_name, disrupt_method=hard_reset_method):
node.ssh_client.connect().exec_command(disrupt_method) node.ssh_client.connect().exec_command(disrupt_method)
LOG.info('disrupt exec: {} on server: {}'.format(disrupt_method, LOG.info('disrupt exec: {} on server: {}'.format(disrupt_method,
node.name)) node.name))
check_overcloud_node_responsive(node)
def check_overcloud_node_responsive(node):
node_checked = sh.execute("hostname", node_checked = sh.execute("hostname",
ssh_client=node.ssh_client, ssh_client=node.ssh_client,
expect_exit_status=None).stdout expect_exit_status=None).stdout
@ -117,10 +122,10 @@ def disrupt_all_controller_nodes(disrupt_method=hard_reset_method,
controller.name)) controller.name))
tobiko.cleanup_fixture(controller.ssh_client) tobiko.cleanup_fixture(controller.ssh_client)
if sequentially: if sequentially:
test_cloud_recovery.check_overcloud_node_responsive(controller) check_overcloud_node_responsive(controller)
if not sequentially: if not sequentially:
for controller in topology.list_openstack_nodes(group='controller'): for controller in topology.list_openstack_nodes(group='controller'):
test_cloud_recovery.check_overcloud_node_responsive(controller) check_overcloud_node_responsive(controller)
def get_main_vip(): def get_main_vip():
@ -224,3 +229,58 @@ def reset_ovndb_master_container():
containers.action_on_container('restart', containers.action_on_container('restart',
partial_container_name=resource, partial_container_name=resource,
container_host=node) container_host=node)
def evac_failover_compute(compute_host, failover_type=hard_reset_method):
    """Disrupt a compute node to trigger its instance-HA evacuation.

    compute_host: name of the compute node to disrupt
    failover_type: disruption handed to reset_node() as disrupt_method,
    hard_reset_method by default
    """
    reset_node(compute_host, disrupt_method=failover_type)
def check_iha_evacuation(failover_type=None, vm_type=None):
    """Run the instance-HA evacuation scenario two times in a row.

    Every round creates 4 vms, picks a compute host holding vms, disrupts
    it with failover_type, runs the passive overcloud health checks and
    then checks that the vms seen on that host before the failover were
    evacuated and still answer ping.

    vm_type == 'shutoff': all instances are stopped before the failover.
    vm_type == 'evac_image_vm': one extra vm is created from the evacuable
    image stack and its placement is checked separately.
    """
    shutoff_case = vm_type == 'shutoff'
    evac_image_case = vm_type == 'evac_image_vm'

    for iteration in range(2):
        LOG.info(f'Beign IHA tests iteration {iteration}')
        LOG.info('creatr 4 vms')
        nova.create_multiple_unique_vms(n_vms=4)
        compute_host = nova.get_random_compute_with_vms_name()
        vms_before = nova.get_compute_vms_df(compute_host)

        if shutoff_case:
            nova.stop_all_instances()
        if evac_image_case:
            # remember where the evacuable-image vm lives before the failover
            tagged_stack = nova.random_vm_create_evacuable_image_tag()
            tagged_vm_id = nova.get_stack_server_id(tagged_stack)
            tagged_vm_before = nova.vm_df(tagged_vm_id, nova.get_vms_table())

        nova.check_df_vms_ping(vms_before)
        LOG.info(f'perform a failover on {compute_host}')
        evac_failover_compute(compute_host, failover_type=failover_type)
        test_cloud_recovery.overcloud_health_checks(passive_checks_only=True)
        vms_after = nova.get_compute_vms_df(compute_host)

        if evac_image_case:
            nova.check_vm_evacuations(vms_df_old=tagged_vm_before,
                                      vms_df_new=vms_after,
                                      check_no_evacuation=True)
            tagged_vm_after = nova.vm_df(tagged_vm_id, nova.get_vms_table())
            nova.check_vm_evacuations(tagged_vm_before, tagged_vm_after)

        LOG.info('check evac is Done')
        nova.check_vm_evacuations(vms_df_old=vms_before,
                                  vms_df_new=vms_after)
        nova.check_df_vms_ping(vms_before)
def check_iha_evacuation_evac_image_vm():
    """Run check_iha_evacuation() with hard_reset_method as failover and
    vm_type='evac_image_vm'"""
    check_iha_evacuation(failover_type=hard_reset_method,
                         vm_type='evac_image_vm')
def check_iha_evacuation_hard_reset():
    """Run check_iha_evacuation() with hard_reset_method as failover"""
    check_iha_evacuation(failover_type=hard_reset_method)
def check_iha_evacuation_network_disruption():
    """Run check_iha_evacuation() with network_disruption as failover"""
    check_iha_evacuation(failover_type=network_disruption)
def check_iha_evacuation_hard_reset_shutfoff_inatance():
    """Run check_iha_evacuation() with hard_reset_method as failover and
    vm_type='shutoff'"""
    check_iha_evacuation(failover_type=hard_reset_method, vm_type='shutoff')

View File

@ -1,11 +1,6 @@
from __future__ import absolute_import from __future__ import absolute_import
import random
from oslo_log import log
import testtools import testtools
from tobiko.shell import ping
from tobiko.shell import sh
from tobiko.tests.faults.ha import cloud_disruptions from tobiko.tests.faults.ha import cloud_disruptions
from tobiko.tripleo import pacemaker from tobiko.tripleo import pacemaker
from tobiko.tripleo import processes from tobiko.tripleo import processes
@ -13,11 +8,6 @@ from tobiko.tripleo import containers
from tobiko.tripleo import nova from tobiko.tripleo import nova
from tobiko.tripleo import neutron from tobiko.tripleo import neutron
from tobiko.tripleo import undercloud from tobiko.tripleo import undercloud
from tobiko.openstack import stacks
import tobiko
LOG = log.getLogger(__name__)
def overcloud_health_checks(passive_checks_only=False): def overcloud_health_checks(passive_checks_only=False):
@ -29,37 +19,15 @@ def overcloud_health_checks(passive_checks_only=False):
if not passive_checks_only: if not passive_checks_only:
# create a uniq stack # create a uniq stack
check_vm_create() check_vm_create()
else: nova.start_all_instances()
# verify VM status is updated after reboot
nova.wait_for_all_instances_status('SHUTOFF')
nova.start_all_instances()
containers.list_node_containers.cache_clear() containers.list_node_containers.cache_clear()
containers.assert_all_tripleo_containers_running() containers.assert_all_tripleo_containers_running()
containers.assert_equal_containers_state() containers.assert_equal_containers_state()
# check vm create with ssh and ping checks # check vm create with ssh and ping checks
def check_vm_create(stack_name='stack{}'.format(random.randint(0, 1000000))): def check_vm_create():
"""stack_name: unique stack name , nova.random_vm_create()
so that each time a new vm is created"""
# create a vm
stack = stacks.CirrosServerStackFixture(
stack_name=stack_name)
tobiko.reset_fixture(stack)
stack.wait_for_create_complete()
# Test SSH connectivity to floating IP address
sh.get_hostname(ssh_client=stack.ssh_client)
# Test ICMP connectivity to floating IP address
ping.ping_until_received(
stack.floating_ip_address).assert_replied()
def check_overcloud_node_responsive(node):
"""wait until we get response for hostname command"""
hostname_check = sh.execute("hostname", ssh_client=node.ssh_client,
expect_exit_status=None).stdout
LOG.info('{} is up '.format(hostname_check))
# check cluster failed statuses # check cluster failed statuses
@ -73,7 +41,7 @@ def check_overcloud_processes_health():
@undercloud.skip_if_missing_undercloud @undercloud.skip_if_missing_undercloud
class RebootTripleoNodesTest(testtools.TestCase): class DisruptTripleoNodesTest(testtools.TestCase):
""" HA Tests: run health check -> disruptive action -> health check """ HA Tests: run health check -> disruptive action -> health check
disruptive_action: a function that runs some disruptive_action: a function that runs some
@ -91,14 +59,11 @@ class RebootTripleoNodesTest(testtools.TestCase):
cloud_disruptions.reset_all_controller_nodes() cloud_disruptions.reset_all_controller_nodes()
overcloud_health_checks() overcloud_health_checks()
def test_sequentially_hard_reboot_controllers_recovery(self):
overcloud_health_checks()
cloud_disruptions.reset_all_controller_nodes_sequentially()
overcloud_health_checks()
def test_reboot_computes_recovery(self): def test_reboot_computes_recovery(self):
overcloud_health_checks() overcloud_health_checks()
cloud_disruptions.reset_all_compute_nodes(hard_reset=True) cloud_disruptions.reset_all_compute_nodes(hard_reset=True)
# verify VM status is updated after reboot
nova.wait_for_all_instances_status('SHUTOFF')
overcloud_health_checks(passive_checks_only=True) overcloud_health_checks(passive_checks_only=True)
def test_reboot_controller_main_vip(self): def test_reboot_controller_main_vip(self):
@ -132,6 +97,26 @@ class RebootTripleoNodesTest(testtools.TestCase):
overcloud_health_checks() overcloud_health_checks()
cloud_disruptions.reset_ovndb_master_container() cloud_disruptions.reset_ovndb_master_container()
overcloud_health_checks() overcloud_health_checks()
@pacemaker.skip_if_instanceha_not_delpoyed
def test_instanceha_evacuation_hard_reset(self):
    """Run the overcloud health checks, then the hard-reset IHA
    evacuation check"""
    overcloud_health_checks()
    cloud_disruptions.check_iha_evacuation_hard_reset()
@pacemaker.skip_if_instanceha_not_delpoyed
def test_instanceha_evacuation_network_disruption(self):
    """Run the overcloud health checks, then the network-disruption IHA
    evacuation check"""
    overcloud_health_checks()
    cloud_disruptions.check_iha_evacuation_network_disruption()
@pacemaker.skip_if_instanceha_not_delpoyed
def test_instanceha_evacuation_hard_reset_shutfoff_inatance(self):
    """Run the overcloud health checks, then the hard-reset IHA
    evacuation check with all instances shut off.

    Guarded by the same instance-HA skip decorator as the other IHA
    evacuation tests of this class, since it exercises the same
    evacuation flow.
    """
    overcloud_health_checks()
    cloud_disruptions.check_iha_evacuation_hard_reset_shutfoff_inatance()
@pacemaker.skip_if_instanceha_not_delpoyed
def test_check_instanceha_evacuation_evac_image_vm(self):
    """Run the overcloud health checks, then the IHA evacuation check
    for a vm created from the evacuable image stack.

    Guarded by the same instance-HA skip decorator as the other IHA
    evacuation tests of this class, since it exercises the same
    evacuation flow.
    """
    overcloud_health_checks()
    cloud_disruptions.check_iha_evacuation_evac_image_vm()
# [..] # [..]
# more tests to follow # more tests to follow
# run health checks # run health checks

View File

@ -1,11 +1,17 @@
from __future__ import absolute_import from __future__ import absolute_import
import random
import time import time
from oslo_log import log from oslo_log import log
import pandas
import tobiko import tobiko
from tobiko.shell import ping
from tobiko.openstack import nova from tobiko.openstack import nova
from tobiko.shell import sh
from tobiko.openstack import stacks
LOG = log.getLogger(__name__) LOG = log.getLogger(__name__)
@ -54,6 +60,21 @@ def start_all_instances():
tobiko.fail(instance_info) tobiko.fail(instance_info)
def stop_all_instances():
    """shut off every overcloud instance returned by nova.list_servers()

    each instance is shut off, its resulting state is logged after a short
    pause, and the test fails as soon as one is not in SHUTOFF status"""
    for server in nova.list_servers():
        stopped = nova.shutoff_server(server)
        time.sleep(3)
        hypervisor = stopped._info[  # pylint: disable=W0212
            'OS-EXT-SRV-ATTR:hypervisor_hostname']
        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
            nova_instance=stopped.name,
            state=stopped.status,
            host=hypervisor)
        LOG.info(instance_info)
        if stopped.status != 'SHUTOFF':
            tobiko.fail(instance_info)
def wait_for_all_instances_status(status, timeout=None): def wait_for_all_instances_status(status, timeout=None):
"""wait for all instances for a certain status or raise an exception""" """wait for all instances for a certain status or raise an exception"""
for instance in nova.list_servers(): for instance in nova.list_servers():
@ -65,3 +86,150 @@ def wait_for_all_instances_status(status, timeout=None):
host=instance._info[ # pylint: disable=W0212 host=instance._info[ # pylint: disable=W0212
'OS-EXT-SRV-ATTR:hypervisor_hostname']) 'OS-EXT-SRV-ATTR:hypervisor_hostname'])
LOG.info(instance_info) LOG.info(instance_info)
def get_vms_table():
    """build a dataframe with one row per nova server

    columns: vm_host (hypervisor hostname), vm_id, vm_state"""
    rows = []
    for server in nova.list_servers():
        hypervisor = server._info[  # pylint: disable=W0212
            'OS-EXT-SRV-ATTR:hypervisor_hostname']
        rows.append((hypervisor, server.id, server.status))
    return pandas.DataFrame(rows, columns=['vm_host', 'vm_id', 'vm_state'])
def list_computes():
    """return the hypervisor hostname of every nova hypervisor"""
    hostnames = []
    for hypervisor in nova.list_hypervisors():
        hostnames.append(hypervisor.hypervisor_hostname)
    return hostnames
def get_compute_vms_df(compute_host):
    """input: compute hostname
    output: the rows of the vms table whose vm_host equals that hostname"""
    all_vms = get_vms_table()
    return all_vms.query(f"vm_host=='{compute_host}'")
def get_random_compute_with_vms_name():
    """get the name of a compute holding vm/s

    despite the name no random choice is made: the first compute from
    list_computes() with a non-empty vms dataframe is returned, or None
    when no compute holds a vm"""
    computes_with_vms = (host for host in list_computes()
                         if not get_compute_vms_df(host).empty)
    return next(computes_with_vms, None)
def vm_info(vm_id, vms_df):
    """input: vm id and a vms df
    output: the df rows of that vm rendered as a string"""
    matching_rows = vms_df.query(f"vm_id == '{vm_id}'")
    return matching_rows.to_string()
def vm_df(vm_id, vms_df):
    """input: vm id and a vms df
    output: dataframe holding only the rows of that vm"""
    vm_filter = f"vm_id == '{vm_id}'"
    return vms_df.query(vm_filter)
def vm_floating_ip(vm_id):
    """input: vm_id
    output: the first floating ip address listed for that server"""
    server = nova.get_server(vm_id)
    addresses = nova.list_server_ip_addresses(server, address_type='floating')
    return addresses.first
def check_ping_vm_fip(fip):
    """ping fip until a reply is received and assert it replied"""
    ping_result = ping.ping_until_received(fip)
    ping_result.assert_replied()
def check_df_vms_ping(df):
    """input: dataframe with a vm_id column
    ping the floating ip of every vm listed in df"""
    vm_ids = df.vm_id.to_list()
    for vm_id in vm_ids:
        floating_ip = vm_floating_ip(vm_id)
        check_ping_vm_fip(floating_ip)
def vm_location(vm_id, vms_df):
    """input: vm id and a vms df
    output: the vm_host value(s) of that vm as a string, without index"""
    matching_rows = vms_df.query(f"vm_id == '{vm_id}'")
    hosts = matching_rows['vm_host']
    return hosts.to_string(index=False)
def check_vm_evacuations(vms_df_old=None, vms_df_new=None, timeout=120,
                         interval=2, check_no_evacuation=False):
    """check evacuation of vms by comparing their host in two vms tables

    vms_df_old: vms dataframe taken before the failover; its vm ids drive
    the check
    vms_df_new: vms dataframe taken after the failover
    timeout: seconds to keep retrying before failing
    interval: seconds slept between retries
    check_no_evacuation: when True a vm counts as failed if its host
    differs between the tables, otherwise if its host is the same

    returns as soon as a pass finds no failure, calls tobiko.fail() with
    the collected failures once the timeout is exhausted

    NOTE(review): both dataframes are arguments that are never refreshed
    inside the loop, so every retry compares the same data - confirm
    whether callers are expected to pass a fresh table instead.
    """
    failures = []
    start = time.time()

    while time.time() - start < timeout:
        # start each pass from scratch, only the last pass is reported
        failures = []
        for vm_id in vms_df_old.vm_id.to_list():
            old_bm_host = vm_location(vm_id, vms_df_old)
            new_vm_host = vm_location(vm_id, vms_df_new)

            # cond is True when this vm violates the expectation
            if check_no_evacuation:
                cond = bool(old_bm_host != new_vm_host)
            else:
                cond = bool(old_bm_host == new_vm_host)

            if cond:
                failures.append(
                    'failed vm evacuations: {}\n\n'.format(vm_info(vm_id,
                                                                   vms_df_old)))
        if failures:
            LOG.info('Failed nova evacuation:\n {}'.format(failures))
            LOG.info('Not all nova vms evacuated ..')
            LOG.info('Retrying , timeout at: {}'
                     .format(timeout-(time.time() - start)))
            time.sleep(interval)
        else:
            LOG.info(vms_df_old.to_string())
            LOG.info('All vms were evacuated!')
            return
    # exhausted all retries
    if failures:
        tobiko.fail(
            'failed vm evacuations:\n{!s}', '\n'.join(failures))
# check vm create with ssh and ping checks
def random_vm_create(stack_name='stack{}'.format(
                        random.randint(0, 1000000)),
                     stack_template=stacks.CirrosServerStackFixture):
    """create a vm from a stack and check it over ssh and ping

    stack_name: name given to the stack
    stack_template: stack fixture class used to build the vm
    output: the created stack fixture

    NOTE(review): the default stack_name is computed once, when this
    function is defined, so calls that omit it share one name - confirm
    that this is what callers expect.
    """
    LOG.info(f'creating vm - {stack_name}')
    vm_stack = stack_template(stack_name=stack_name)
    # (re)create the stack and wait until it is fully built
    tobiko.reset_fixture(vm_stack)
    vm_stack.wait_for_create_complete()
    tobiko.cleanup_fixture(vm_stack.ssh_client)
    # ssh connectivity to the floating ip address
    sh.get_hostname(ssh_client=vm_stack.ssh_client)
    # icmp connectivity to the floating ip address
    ping_result = ping.ping_until_received(vm_stack.floating_ip_address)
    ping_result.assert_replied()
    return vm_stack
def random_vm_create_evacuable_image_tag():
    """create a vm with random_vm_create() from the evacuable server stack
    output: the created stack fixture"""
    evacuable_template = stacks.EvacuableServerStackFixture
    return random_vm_create(stack_template=evacuable_template)
def random_vm_create_shutoff_state():
    """create a vm with random_vm_create() and shut it off
    output: whatever nova.shutoff_server() returns for that server"""
    created_stack = random_vm_create()
    return nova.shutoff_server(created_stack.server_details)
def get_stack_server_id(stack):
    """input: a stack exposing server_details
    output: the id of that server"""
    server = stack.server_details
    return server.id
def create_multiple_unique_vms(n_vms=2):
    """create n_vms(int) vms, each from a stack with a freshly drawn
    random name"""
    for _index in range(n_vms):
        suffix = random.randint(0, 1000000)
        random_vm_create(stack_name='stack{}'.format(suffix))

View File

@ -286,3 +286,17 @@ def get_overcloud_resource(resource_type=None,
pcs_df_query_resource_type = pcs_df.query( pcs_df_query_resource_type = pcs_df.query(
'resource_type=="{}"'.format(resource_type)) 'resource_type=="{}"'.format(resource_type))
return pcs_df_query_resource_type['resource'].unique().tolist() return pcs_df_query_resource_type['resource'].unique().tolist()
def instanceha_delpoyed():
    """check IHA deployment

    output: False when there is no overcloud, otherwise the overcloud
    nodes running the nova-evacuate pcs resource"""
    if not overcloud.has_overcloud():
        return False
    return get_overcloud_nodes_running_pcs_resource(
        resource='nova-evacuate')
# Built with tobiko.skip_unless and the reason 'instanceha not delpoyed'.
# NOTE(review): instanceha_delpoyed() is called right here, when the module
# is imported, so its result (not the function) is handed to
# tobiko.skip_unless - confirm whether a callable predicate is expected.
skip_if_instanceha_not_delpoyed = tobiko.skip_unless(
    'instanceha not delpoyed', instanceha_delpoyed())