Merge "add IHA tests Change-Id: I8d2740e877c607c3b225dd3475b4abc6f3420378"

This commit is contained in:
Zuul 2020-06-20 00:37:54 +00:00 committed by Gerrit Code Review
commit 40cb580bd9
4 changed files with 270 additions and 43 deletions

View File

@ -10,6 +10,7 @@ from tobiko.tripleo import topology as tripleo_topology
from tobiko.openstack import keystone
from tobiko.tripleo import pacemaker
from tobiko.tripleo import containers
from tobiko.tripleo import nova
from oslo_log import log
from tobiko.tests.faults.ha import test_cloud_recovery
@ -38,6 +39,7 @@ ovn_db_pcs_resource_restart = """sudo pcs resource restart ovn-dbs-bundle"""
def get_node(node_name):
    """Return the first OpenStack topology node matching node_name.

    Everything after the first '.' in node_name is dropped before the
    comparison, so a dotted host name is matched by its leading label.
    Raises IndexError when no listed node has that name.
    """
    short_name = node_name.split('.')[0]
    matching_nodes = [candidate
                      for candidate in topology.list_openstack_nodes()
                      if candidate.name == short_name]
    return matching_nodes[0]
@ -65,7 +67,10 @@ def disrupt_node(node_name, disrupt_method=hard_reset_method):
node.ssh_client.connect().exec_command(disrupt_method)
LOG.info('disrupt exec: {} on server: {}'.format(disrupt_method,
node.name))
check_overcloud_node_responsive(node)
def check_overcloud_node_responsive(node):
node_checked = sh.execute("hostname",
ssh_client=node.ssh_client,
expect_exit_status=None).stdout
@ -117,10 +122,10 @@ def disrupt_all_controller_nodes(disrupt_method=hard_reset_method,
controller.name))
tobiko.cleanup_fixture(controller.ssh_client)
if sequentially:
test_cloud_recovery.check_overcloud_node_responsive(controller)
check_overcloud_node_responsive(controller)
if not sequentially:
for controller in topology.list_openstack_nodes(group='controller'):
test_cloud_recovery.check_overcloud_node_responsive(controller)
check_overcloud_node_responsive(controller)
def get_main_vip():
@ -224,3 +229,58 @@ def reset_ovndb_master_container():
containers.action_on_container('restart',
partial_container_name=resource,
container_host=node)
def evac_failover_compute(compute_host, failover_type=hard_reset_method):
    """Disrupt a compute node in order to trigger its instance-HA evacuation.

    compute_host: name of the compute node, handed to reset_node
    failover_type: disrupt method forwarded to reset_node
        (hard_reset_method unless overridden)
    """
    reset_node(compute_host,
               disrupt_method=failover_type)
def check_iha_evacuation(failover_type=None, vm_type=None):
    """Run two rounds of: create vms, fail a compute over, verify evacuation.

    Each round creates 4 vms, picks a compute host that holds vms, records
    the vms on it, disrupts the host with failover_type, runs the passive
    overcloud health checks and then verifies that the recorded vms were
    evacuated and answer ping.

    failover_type: disrupt method forwarded to evac_failover_compute
    vm_type: optional scenario selector
        'shutoff'       - all instances are stopped before the failover
        'evac_image_vm' - an extra vm is created from the evacuable stack
                          template and gets its own evacuation checks
    """
    stop_vms_first = vm_type == 'shutoff'
    with_evacuable_vm = vm_type == 'evac_image_vm'

    for iteration in range(2):
        LOG.info(f'Beign IHA tests iteration {iteration}')
        LOG.info('creatr 4 vms')
        nova.create_multiple_unique_vms(n_vms=4)
        compute_host = nova.get_random_compute_with_vms_name()
        vms_before_df = nova.get_compute_vms_df(compute_host)

        if stop_vms_first:
            nova.stop_all_instances()
        if with_evacuable_vm:
            evacuable_stack = nova.random_vm_create_evacuable_image_tag()
            evacuable_vm_id = nova.get_stack_server_id(evacuable_stack)
            evacuable_before_df = nova.vm_df(evacuable_vm_id,
                                             nova.get_vms_table())

        nova.check_df_vms_ping(vms_before_df)
        LOG.info(f'perform a failover on {compute_host}')
        evac_failover_compute(compute_host, failover_type=failover_type)
        test_cloud_recovery.overcloud_health_checks(passive_checks_only=True)
        vms_after_df = nova.get_compute_vms_df(compute_host)

        if with_evacuable_vm:
            # first check of the extra vm runs with check_no_evacuation=True
            # against the vms listed on the failed host
            nova.check_vm_evacuations(vms_df_old=evacuable_before_df,
                                      vms_df_new=vms_after_df,
                                      check_no_evacuation=True)
            evacuable_after_df = nova.vm_df(evacuable_vm_id,
                                            nova.get_vms_table())
            nova.check_vm_evacuations(evacuable_before_df,
                                      evacuable_after_df)

        LOG.info('check evac is Done')
        nova.check_vm_evacuations(vms_df_old=vms_before_df,
                                  vms_df_new=vms_after_df)
        nova.check_df_vms_ping(vms_before_df)
def check_iha_evacuation_evac_image_vm():
    """Run check_iha_evacuation with a hard reset and an 'evac_image_vm' vm."""
    check_iha_evacuation(vm_type='evac_image_vm',
                         failover_type=hard_reset_method)
def check_iha_evacuation_hard_reset():
    """Run check_iha_evacuation using hard_reset_method as the failover."""
    failover = hard_reset_method
    check_iha_evacuation(failover_type=failover)
def check_iha_evacuation_network_disruption():
    """Run check_iha_evacuation using network_disruption as the failover."""
    failover = network_disruption
    check_iha_evacuation(failover_type=failover)
def check_iha_evacuation_hard_reset_shutfoff_inatance():
    """Run check_iha_evacuation with a hard reset and vm_type='shutoff'."""
    check_iha_evacuation(vm_type='shutoff',
                         failover_type=hard_reset_method)

View File

@ -1,11 +1,6 @@
from __future__ import absolute_import
import random
from oslo_log import log
import testtools
from tobiko.shell import ping
from tobiko.shell import sh
from tobiko.tests.faults.ha import cloud_disruptions
from tobiko.tripleo import pacemaker
from tobiko.tripleo import processes
@ -13,11 +8,6 @@ from tobiko.tripleo import containers
from tobiko.tripleo import nova
from tobiko.tripleo import neutron
from tobiko.tripleo import undercloud
from tobiko.openstack import stacks
import tobiko
LOG = log.getLogger(__name__)
def overcloud_health_checks(passive_checks_only=False):
@ -29,37 +19,15 @@ def overcloud_health_checks(passive_checks_only=False):
if not passive_checks_only:
# create a uniq stack
check_vm_create()
else:
# verify VM status is updated after reboot
nova.wait_for_all_instances_status('SHUTOFF')
nova.start_all_instances()
nova.start_all_instances()
containers.list_node_containers.cache_clear()
containers.assert_all_tripleo_containers_running()
containers.assert_equal_containers_state()
# check vm create with ssh and ping checks
def check_vm_create(stack_name=None):
    """Create a Cirros server stack and verify SSH and ICMP connectivity.

    stack_name: name of the stack to create. When omitted, a fresh
        'stack<random int>' name is generated on every call, so that each
        call creates a new vm.
    """
    if stack_name is None:
        # Generated per call: a default expression in the signature is
        # evaluated only once, when the function is defined, so it would
        # hand the very same "random" name to every call.
        stack_name = 'stack{}'.format(random.randint(0, 1000000))
    # create a vm
    stack = stacks.CirrosServerStackFixture(
        stack_name=stack_name)
    tobiko.reset_fixture(stack)
    stack.wait_for_create_complete()
    # Test SSH connectivity to floating IP address
    sh.get_hostname(ssh_client=stack.ssh_client)

    # Test ICMP connectivity to floating IP address
    ping.ping_until_received(
        stack.floating_ip_address).assert_replied()
def check_overcloud_node_responsive(node):
    """Run `hostname` on the node through its ssh client and log the output.

    The command is executed with expect_exit_status=None.
    NOTE(review): presumably the ssh connection is retried until the node
    answers - confirm against sh.execute / the ssh client.
    """
    result = sh.execute("hostname",
                        ssh_client=node.ssh_client,
                        expect_exit_status=None)
    LOG.info('{} is up '.format(result.stdout))
def check_vm_create():
    """Create a vm through nova.random_vm_create; its result is discarded."""
    nova.random_vm_create()
# check cluster failed statuses
@ -73,7 +41,7 @@ def check_overcloud_processes_health():
@undercloud.skip_if_missing_undercloud
class RebootTripleoNodesTest(testtools.TestCase):
class DisruptTripleoNodesTest(testtools.TestCase):
""" HA Tests: run health check -> disruptive action -> health check
disruptive_action: a function that runs some
@ -91,14 +59,11 @@ class RebootTripleoNodesTest(testtools.TestCase):
cloud_disruptions.reset_all_controller_nodes()
overcloud_health_checks()
def test_sequentially_hard_reboot_controllers_recovery(self):
    """Health check -> reset all controllers sequentially -> health check."""
    overcloud_health_checks()
    cloud_disruptions.reset_all_controller_nodes_sequentially()
    overcloud_health_checks()
def test_reboot_computes_recovery(self):
    """Health check -> hard reset all computes -> passive health check."""
    overcloud_health_checks()
    cloud_disruptions.reset_all_compute_nodes(hard_reset=True)
    # verify VM status is updated after reboot
    nova.wait_for_all_instances_status('SHUTOFF')
    overcloud_health_checks(passive_checks_only=True)
def test_reboot_controller_main_vip(self):
@ -132,6 +97,26 @@ class RebootTripleoNodesTest(testtools.TestCase):
overcloud_health_checks()
cloud_disruptions.reset_ovndb_master_container()
overcloud_health_checks()
@pacemaker.skip_if_instanceha_not_delpoyed
def test_instanceha_evacuation_hard_reset(self):
    """Health check, then the instance-HA evacuation check (hard reset)."""
    overcloud_health_checks()
    cloud_disruptions.check_iha_evacuation_hard_reset()
@pacemaker.skip_if_instanceha_not_delpoyed
def test_instanceha_evacuation_network_disruption(self):
    """Health check, then the instance-HA check with network disruption."""
    overcloud_health_checks()
    cloud_disruptions.check_iha_evacuation_network_disruption()
@pacemaker.skip_if_instanceha_not_delpoyed
def test_instanceha_evacuation_hard_reset_shutfoff_inatance(self):
    """Health check, then the instance-HA check with shut-off instances.

    Guarded by the same instance-HA skip decorator as the other
    test_instanceha_evacuation_* tests, because the evacuation check
    relies on instance HA being deployed.
    """
    overcloud_health_checks()
    cloud_disruptions.check_iha_evacuation_hard_reset_shutfoff_inatance()
@pacemaker.skip_if_instanceha_not_delpoyed
def test_check_instanceha_evacuation_evac_image_vm(self):
    """Health check, then the instance-HA check with an evacuable-image vm.

    Guarded by the same instance-HA skip decorator as the other
    test_instanceha_evacuation_* tests, because the evacuation check
    relies on instance HA being deployed.
    """
    overcloud_health_checks()
    cloud_disruptions.check_iha_evacuation_evac_image_vm()
# [..]
# more tests to follow
# run health checks

View File

@ -1,11 +1,17 @@
from __future__ import absolute_import
import random
import time
from oslo_log import log
import pandas
import tobiko
from tobiko.shell import ping
from tobiko.openstack import nova
from tobiko.shell import sh
from tobiko.openstack import stacks
LOG = log.getLogger(__name__)
@ -54,6 +60,21 @@ def start_all_instances():
tobiko.fail(instance_info)
def stop_all_instances():
    """Shut off every server returned by nova.list_servers().

    After each shutoff call the function sleeps 3 seconds, logs the server
    name, status and hypervisor hostname, and fails through tobiko.fail
    when the reported status is not 'SHUTOFF'.
    """
    for server in nova.list_servers():
        stopped = nova.shutoff_server(server)
        time.sleep(3)
        name = stopped.name
        state = stopped.status
        host = stopped._info[  # pylint: disable=W0212
            'OS-EXT-SRV-ATTR:hypervisor_hostname']
        report = 'instance {nova_instance} is {state} on {host}'.format(
            nova_instance=name, state=state, host=host)
        LOG.info(report)
        if state != 'SHUTOFF':
            tobiko.fail(report)
def wait_for_all_instances_status(status, timeout=None):
"""wait for all instances for a certain status or raise an exception"""
for instance in nova.list_servers():
@ -65,3 +86,150 @@ def wait_for_all_instances_status(status, timeout=None):
host=instance._info[ # pylint: disable=W0212
'OS-EXT-SRV-ATTR:hypervisor_hostname'])
LOG.info(instance_info)
def get_vms_table():
    """Build a dataframe with one row per server from nova.list_servers().

    Columns: vm_host (hypervisor hostname), vm_id (server id) and
    vm_state (server status).
    """
    rows = []
    for server in nova.list_servers():
        hypervisor = server._info[  # pylint: disable=W0212
            'OS-EXT-SRV-ATTR:hypervisor_hostname']
        rows.append((hypervisor, server.id, server.status))
    return pandas.DataFrame(rows,
                            columns=['vm_host', 'vm_id', 'vm_state'])
def list_computes():
    """Return the hypervisor_hostname of every hypervisor nova lists."""
    hostnames = []
    for hypervisor in nova.list_hypervisors():
        hostnames.append(hypervisor.hypervisor_hostname)
    return hostnames
def get_compute_vms_df(compute_host):
    """Return the rows of the vms table whose vm_host equals compute_host.

    input: compute hostname, compared for equality with the vm_host column
    output: dataframe with the vms of that host
    """
    all_vms = get_vms_table()
    host_filter = f"vm_host=='{compute_host}'"
    return all_vms.query(host_filter)
def get_random_compute_with_vms_name():
    """Return the first listed compute that currently holds at least one vm.

    Computes are probed in the order list_computes() returns them; None is
    returned when no compute holds a vm.
    """
    computes_with_vms = (compute for compute in list_computes()
                         if not get_compute_vms_df(compute).empty)
    return next(computes_with_vms, None)
def vm_info(vm_id, vms_df):
    """Render the vms_df rows belonging to vm_id as text.

    input: vm id and a vms dataframe
    output: string form of the rows whose vm_id column equals vm_id
    """
    id_filter = f"vm_id == '{vm_id}'"
    matching_rows = vms_df.query(id_filter)
    return matching_rows.to_string()
def vm_df(vm_id, vms_df):
    """Select the vms_df rows belonging to vm_id.

    input: vm id and a vms dataframe
    output: dataframe holding only the rows whose vm_id equals vm_id
    """
    id_filter = f"vm_id == '{vm_id}'"
    return vms_df.query(id_filter)
def vm_floating_ip(vm_id):
    """Return the first floating ip address of the server with id vm_id.

    input: vm_id
    output: `.first` of the server's 'floating' ip addresses
    """
    server = nova.get_server(vm_id)
    floating_addresses = nova.list_server_ip_addresses(
        server, address_type='floating')
    return floating_addresses.first
def check_ping_vm_fip(fip):
    """Ping fip until a reply is received and assert that it replied."""
    ping_result = ping.ping_until_received(fip)
    ping_result.assert_replied()
def check_df_vms_ping(df):
    """Ping the floating ip of every vm listed in df.

    input: dataframe with a vm_id column
    """
    vm_ids = df.vm_id.to_list()
    for vm_id in vm_ids:
        fip = vm_floating_ip(vm_id)
        check_ping_vm_fip(fip)
def vm_location(vm_id, vms_df):
    """Return the vm_host value(s) recorded for vm_id, rendered as text.

    input: vm id and a vms dataframe
    output: the vm_host column of the matching rows as a string
        (rendered without the index)
    """
    id_filter = f"vm_id == '{vm_id}'"
    hosts = vms_df.query(id_filter)['vm_host']
    return hosts.to_string(index=False)
def check_vm_evacuations(vms_df_old=None, vms_df_new=None, timeout=120,
                         interval=2, check_no_evacuation=False):
    """Compare vm locations between two vms tables and fail on mismatches.

    vms_df_old: vms dataframe taken before the disruption; its vm_id
        column drives the check
    vms_df_new: vms dataframe taken after the disruption
    timeout: seconds to keep retrying before giving up
    interval: seconds slept between retries
    check_no_evacuation: when false a vm fails the check if its location
        is the same in both tables; when true it fails if the location
        differs

    Returns as soon as a pass finds no failing vm; once the timeout is
    exhausted the collected failures are reported through tobiko.fail.

    NOTE(review): both dataframes are fixed inputs and are not re-read
    inside the retry loop, so every pass compares the same data.
    """
    failures = []
    started_at = time.time()

    while time.time() - started_at < timeout:
        failures = []
        for vm_id in vms_df_old.vm_id.to_list():
            host_before = vm_location(vm_id, vms_df_old)
            host_after = vm_location(vm_id, vms_df_new)
            moved = bool(host_before != host_after)
            failed = moved if check_no_evacuation else not moved
            if failed:
                failures.append(
                    'failed vm evacuations: {}\n\n'.format(
                        vm_info(vm_id, vms_df_old)))

        if not failures:
            LOG.info(vms_df_old.to_string())
            LOG.info('All vms were evacuated!')
            return

        LOG.info('Failed nova evacuation:\n {}'.format(failures))
        LOG.info('Not all nova vms evacuated ..')
        remaining = timeout - (time.time() - started_at)
        LOG.info('Retrying , timeout at: {}'.format(remaining))
        time.sleep(interval)

    # exhausted all retries
    if failures:
        tobiko.fail(
            'failed vm evacuations:\n{!s}', '\n'.join(failures))
# check vm create with ssh and ping checks
def random_vm_create(stack_name=None,
                     stack_template=stacks.CirrosServerStackFixture):
    """Create a vm from a stack template and verify SSH and ICMP access.

    stack_name: name of the stack to create. When omitted, a fresh
        'stack<random int>' name is generated on every call, so that each
        call creates a new vm.
    stack_template: stack fixture class instantiated with stack_name
        (stacks.CirrosServerStackFixture unless overridden)

    Returns the created stack fixture.
    """
    if stack_name is None:
        # Generated per call: a default expression in the signature is
        # evaluated only once, when the function is defined, so it would
        # hand the very same "random" name to every call.
        stack_name = 'stack{}'.format(random.randint(0, 1000000))
    # create a vm
    LOG.info(f'creating vm - {stack_name}')
    stack = stack_template(
        stack_name=stack_name)
    tobiko.reset_fixture(stack)
    stack.wait_for_create_complete()
    # the ssh client fixture is cleaned up before it is used below
    tobiko.cleanup_fixture(stack.ssh_client)
    # Test SSH connectivity to floating IP address
    sh.get_hostname(ssh_client=stack.ssh_client)

    # Test ICMP connectivity to floating IP address
    ping.ping_until_received(
        stack.floating_ip_address).assert_replied()
    return stack
def random_vm_create_evacuable_image_tag():
    """Create a vm from stacks.EvacuableServerStackFixture; return its stack."""
    template = stacks.EvacuableServerStackFixture
    return random_vm_create(stack_template=template)
def random_vm_create_shutoff_state():
    """Create a vm, shut it off and return nova.shutoff_server's result."""
    stack = random_vm_create()
    return nova.shutoff_server(stack.server_details)
def get_stack_server_id(stack):
    """Return the id found on the stack's server_details."""
    server = stack.server_details
    return server.id
def create_multiple_unique_vms(n_vms=2):
    """Create n_vms (int) vms, each under its own 'stack<random int>' name."""
    for _index in range(n_vms):
        suffix = random.randint(0, 1000000)
        random_vm_create('stack{}'.format(suffix))

View File

@ -286,3 +286,17 @@ def get_overcloud_resource(resource_type=None,
pcs_df_query_resource_type = pcs_df.query(
'resource_type=="{}"'.format(resource_type))
return pcs_df_query_resource_type['resource'].unique().tolist()
def instanceha_delpoyed():
    """Check the instance-HA (IHA) deployment.

    Looks for the 'nova-evacuate' pcs resource: returns whatever
    get_overcloud_nodes_running_pcs_resource reports for it, or False
    when there is no overcloud.
    """
    if not overcloud.has_overcloud():
        return False
    return get_overcloud_nodes_running_pcs_resource(
        resource='nova-evacuate')
# Skip decorator for tests that require instance HA.
# The predicate function itself is handed over (not its result), so the
# deployment is not probed as a side effect of importing this module.
# NOTE(review): assumes tobiko.skip_unless takes a callable predicate that
# it evaluates when the decorated test runs - confirm against tobiko.
skip_if_instanceha_not_delpoyed = tobiko.skip_unless(
    'instanceha not delpoyed', instanceha_delpoyed)