Merge "add IHA tests Change-Id: I8d2740e877c607c3b225dd3475b4abc6f3420378"
This commit is contained in:
commit
40cb580bd9
@ -10,6 +10,7 @@ from tobiko.tripleo import topology as tripleo_topology
|
||||
from tobiko.openstack import keystone
|
||||
from tobiko.tripleo import pacemaker
|
||||
from tobiko.tripleo import containers
|
||||
from tobiko.tripleo import nova
|
||||
from oslo_log import log
|
||||
from tobiko.tests.faults.ha import test_cloud_recovery
|
||||
|
||||
@ -38,6 +39,7 @@ ovn_db_pcs_resource_restart = """sudo pcs resource restart ovn-dbs-bundle"""
|
||||
|
||||
|
||||
def get_node(node_name):
    """Return the topology node whose short hostname matches *node_name*.

    The argument may be a FQDN; only the part before the first dot is
    compared against node names. Raises IndexError when nothing matches.
    """
    short_name = node_name.split('.')[0]
    matches = []
    for candidate in topology.list_openstack_nodes():
        if candidate.name == short_name:
            matches.append(candidate)
    return matches[0]
|
||||
|
||||
@ -65,7 +67,10 @@ def disrupt_node(node_name, disrupt_method=hard_reset_method):
|
||||
node.ssh_client.connect().exec_command(disrupt_method)
|
||||
LOG.info('disrupt exec: {} on server: {}'.format(disrupt_method,
|
||||
node.name))
|
||||
check_overcloud_node_responsive(node)
|
||||
|
||||
|
||||
def check_overcloud_node_responsive(node):
|
||||
node_checked = sh.execute("hostname",
|
||||
ssh_client=node.ssh_client,
|
||||
expect_exit_status=None).stdout
|
||||
@ -117,10 +122,10 @@ def disrupt_all_controller_nodes(disrupt_method=hard_reset_method,
|
||||
controller.name))
|
||||
tobiko.cleanup_fixture(controller.ssh_client)
|
||||
if sequentially:
|
||||
test_cloud_recovery.check_overcloud_node_responsive(controller)
|
||||
check_overcloud_node_responsive(controller)
|
||||
if not sequentially:
|
||||
for controller in topology.list_openstack_nodes(group='controller'):
|
||||
test_cloud_recovery.check_overcloud_node_responsive(controller)
|
||||
check_overcloud_node_responsive(controller)
|
||||
|
||||
|
||||
def get_main_vip():
|
||||
@ -224,3 +229,58 @@ def reset_ovndb_master_container():
|
||||
containers.action_on_container('restart',
|
||||
partial_container_name=resource,
|
||||
container_host=node)
|
||||
|
||||
|
||||
def evac_failover_compute(compute_host, failover_type=hard_reset_method):
    """Disrupt a compute node to trigger its instance-HA evacuation.

    failover_type: a disruption command such as hard_reset_method or
    network_disruption.
    """
    # Delegates to reset_node (defined elsewhere in this module).
    reset_node(compute_host, disrupt_method=failover_type)
|
||||
|
||||
|
||||
def check_iha_evacuation(failover_type=None, vm_type=None):
    """Check instance-HA evacuation of vms away from a disrupted compute.

    Creates vms, disrupts the compute hosting them, then checks that the
    vms were evacuated to another host and remain pingable. Runs twice.

    failover_type: disruption command passed to evac_failover_compute
        (e.g. hard_reset_method, network_disruption).
    vm_type: optional scenario variant:
        'shutoff' - stop all vms before the failover;
        'evac_image_vm' - also create a vm from the evacuable-tagged image
        and compare its location before/after the failover.
    """
    for iteration in range(2):
        # Fixed log typos: 'Beign' -> 'Begin', 'creatr' -> 'create'
        LOG.info(f'Begin IHA tests iteration {iteration}')
        LOG.info('create 4 vms')
        nova.create_multiple_unique_vms(n_vms=4)
        compute_host = nova.get_random_compute_with_vms_name()
        vms_starting_state_df = nova.get_compute_vms_df(compute_host)
        if vm_type == 'shutoff':
            nova.stop_all_instances()
        if vm_type == 'evac_image_vm':
            evac_vm_stack = nova.random_vm_create_evacuable_image_tag()
            evac_vm_id = nova.get_stack_server_id(evac_vm_stack)
            org_nova_evac_df = nova.vm_df(evac_vm_id, nova.get_vms_table())
        nova.check_df_vms_ping(vms_starting_state_df)
        LOG.info(f'perform a failover on {compute_host}')
        evac_failover_compute(compute_host, failover_type=failover_type)
        test_cloud_recovery.overcloud_health_checks(passive_checks_only=True)
        vms_new_state_df = nova.get_compute_vms_df(compute_host)
        if vm_type == 'evac_image_vm':
            # NOTE(review): with check_no_evacuation=True a host *change*
            # is flagged as failure - confirm the intended semantics for
            # the evacuable vm here.
            nova.check_vm_evacuations(vms_df_old=org_nova_evac_df,
                                      vms_df_new=vms_new_state_df,
                                      check_no_evacuation=True)
            new_nova_evac_df = nova.vm_df(evac_vm_id, nova.get_vms_table())
            nova.check_vm_evacuations(org_nova_evac_df, new_nova_evac_df)
        LOG.info('check evac is Done')
        nova.check_vm_evacuations(vms_df_old=vms_starting_state_df,
                                  vms_df_new=vms_new_state_df)
        nova.check_df_vms_ping(vms_starting_state_df)
|
||||
|
||||
|
||||
def check_iha_evacuation_evac_image_vm():
    """IHA scenario: hard-reset a compute, checking the evacuable-image vm."""
    check_iha_evacuation(failover_type=hard_reset_method,
                         vm_type='evac_image_vm')
|
||||
|
||||
|
||||
def check_iha_evacuation_hard_reset():
    """IHA scenario: evacuate vms after a compute hard reset."""
    check_iha_evacuation(failover_type=hard_reset_method)
|
||||
|
||||
|
||||
def check_iha_evacuation_network_disruption():
    """IHA scenario: evacuate vms after a compute network disruption."""
    check_iha_evacuation(failover_type=network_disruption)
|
||||
|
||||
|
||||
def check_iha_evacuation_hard_reset_shutfoff_inatance():
    """IHA scenario: hard-reset a compute holding shutoff vms.

    NOTE: the misspelled name ('shutfoff_inatance') is public API used by
    the test module; renaming would break callers.
    """
    check_iha_evacuation(failover_type=hard_reset_method, vm_type='shutoff')
|
||||
|
@ -1,11 +1,6 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import random
|
||||
from oslo_log import log
|
||||
|
||||
import testtools
|
||||
from tobiko.shell import ping
|
||||
from tobiko.shell import sh
|
||||
from tobiko.tests.faults.ha import cloud_disruptions
|
||||
from tobiko.tripleo import pacemaker
|
||||
from tobiko.tripleo import processes
|
||||
@ -13,11 +8,6 @@ from tobiko.tripleo import containers
|
||||
from tobiko.tripleo import nova
|
||||
from tobiko.tripleo import neutron
|
||||
from tobiko.tripleo import undercloud
|
||||
from tobiko.openstack import stacks
|
||||
import tobiko
|
||||
|
||||
|
||||
LOG = log.getLogger(__name__)
|
||||
|
||||
|
||||
def overcloud_health_checks(passive_checks_only=False):
|
||||
@ -29,37 +19,15 @@ def overcloud_health_checks(passive_checks_only=False):
|
||||
if not passive_checks_only:
|
||||
# create a uniq stack
|
||||
check_vm_create()
|
||||
else:
|
||||
# verify VM status is updated after reboot
|
||||
nova.wait_for_all_instances_status('SHUTOFF')
|
||||
nova.start_all_instances()
|
||||
nova.start_all_instances()
|
||||
containers.list_node_containers.cache_clear()
|
||||
containers.assert_all_tripleo_containers_running()
|
||||
containers.assert_equal_containers_state()
|
||||
|
||||
|
||||
# check vm create with ssh and ping checks
|
||||
def check_vm_create(stack_name='stack{}'.format(random.randint(0, 1000000))):
    """stack_name: unique stack name,
    so that each time a new vm is created

    NOTE(review): the default value is evaluated once at import time, so
    every defaulted call reuses the same stack name - confirm this is
    acceptable for "unique" stacks.
    """
    # create a vm
    stack = stacks.CirrosServerStackFixture(
        stack_name=stack_name)
    tobiko.reset_fixture(stack)
    stack.wait_for_create_complete()
    # Test SSH connectivity to floating IP address
    sh.get_hostname(ssh_client=stack.ssh_client)

    # Test ICMP connectivity to floating IP address
    ping.ping_until_received(
        stack.floating_ip_address).assert_replied()
|
||||
|
||||
|
||||
def check_overcloud_node_responsive(node):
    """wait until we get response for hostname command

    NOTE(review): no retry loop is visible here; this runs 'hostname' once
    and ignores the exit status (expect_exit_status=None) - confirm the
    waiting behaviour claimed by the docstring.
    """
    hostname_check = sh.execute("hostname", ssh_client=node.ssh_client,
                                expect_exit_status=None).stdout
    LOG.info('{} is up '.format(hostname_check))
|
||||
def check_vm_create():
    """Create a uniquely-named vm (ssh/ping verified inside the helper)."""
    nova.random_vm_create()
|
||||
|
||||
|
||||
# check cluster failed statuses
|
||||
@ -73,7 +41,7 @@ def check_overcloud_processes_health():
|
||||
|
||||
|
||||
@undercloud.skip_if_missing_undercloud
|
||||
class RebootTripleoNodesTest(testtools.TestCase):
|
||||
class DisruptTripleoNodesTest(testtools.TestCase):
|
||||
|
||||
""" HA Tests: run health check -> disruptive action -> health check
|
||||
disruptive_action: a function that runs some
|
||||
@ -91,14 +59,11 @@ class RebootTripleoNodesTest(testtools.TestCase):
|
||||
cloud_disruptions.reset_all_controller_nodes()
|
||||
overcloud_health_checks()
|
||||
|
||||
    def test_sequentially_hard_reboot_controllers_recovery(self):
        # health check -> reset controllers one at a time -> health check
        overcloud_health_checks()
        cloud_disruptions.reset_all_controller_nodes_sequentially()
        overcloud_health_checks()
|
||||
|
||||
    def test_reboot_computes_recovery(self):
        overcloud_health_checks()
        cloud_disruptions.reset_all_compute_nodes(hard_reset=True)
        # verify VM status is updated after reboot
        nova.wait_for_all_instances_status('SHUTOFF')
        # passive checks only: vms are expected to be down after the reboot
        overcloud_health_checks(passive_checks_only=True)
|
||||
|
||||
def test_reboot_controller_main_vip(self):
|
||||
@ -132,6 +97,26 @@ class RebootTripleoNodesTest(testtools.TestCase):
|
||||
overcloud_health_checks()
|
||||
cloud_disruptions.reset_ovndb_master_container()
|
||||
overcloud_health_checks()
|
||||
|
||||
    @pacemaker.skip_if_instanceha_not_delpoyed
    def test_instanceha_evacuation_hard_reset(self):
        # health check, then verify IHA evacuation after a compute hard reset
        overcloud_health_checks()
        cloud_disruptions.check_iha_evacuation_hard_reset()
|
||||
|
||||
    @pacemaker.skip_if_instanceha_not_delpoyed
    def test_instanceha_evacuation_network_disruption(self):
        # health check, then verify IHA evacuation after network disruption
        overcloud_health_checks()
        cloud_disruptions.check_iha_evacuation_network_disruption()
|
||||
|
||||
def test_instanceha_evacuation_hard_reset_shutfoff_inatance(self):
|
||||
overcloud_health_checks()
|
||||
cloud_disruptions.check_iha_evacuation_hard_reset_shutfoff_inatance()
|
||||
|
||||
def test_check_instanceha_evacuation_evac_image_vm(self):
|
||||
overcloud_health_checks()
|
||||
cloud_disruptions.check_iha_evacuation_evac_image_vm()
|
||||
|
||||
|
||||
# [..]
|
||||
# more tests to follow
|
||||
# run health checks
|
||||
|
@ -1,11 +1,17 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import random
|
||||
import time
|
||||
|
||||
from oslo_log import log
|
||||
import pandas
|
||||
|
||||
import tobiko
|
||||
from tobiko.shell import ping
|
||||
from tobiko.openstack import nova
|
||||
from tobiko.shell import sh
|
||||
from tobiko.openstack import stacks
|
||||
|
||||
|
||||
LOG = log.getLogger(__name__)
|
||||
|
||||
@ -54,6 +60,21 @@ def start_all_instances():
|
||||
tobiko.fail(instance_info)
|
||||
|
||||
|
||||
def stop_all_instances():
    """Stop (shutoff) every overcloud instance and verify the final state.

    Iterates all nova servers, requests a shutoff for each one and fails
    the test (tobiko.fail) if any instance did not reach SHUTOFF.
    (Previous docstring was copy-pasted from start_all_instances and the
    local was misleadingly named 'activated_instance'.)
    """
    for instance in nova.list_servers():
        stopped_instance = nova.shutoff_server(instance)
        # grace period for the state transition to settle
        time.sleep(3)
        # NOTE(review): status comes from the shutoff_server return value;
        # the sleep does not refresh it - confirm shutoff_server waits for
        # the SHUTOFF state itself.
        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
            nova_instance=stopped_instance.name,
            state=stopped_instance.status,
            host=stopped_instance._info[  # pylint: disable=W0212
                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
        LOG.info(instance_info)
        if stopped_instance.status != 'SHUTOFF':
            tobiko.fail(instance_info)
|
||||
|
||||
|
||||
def wait_for_all_instances_status(status, timeout=None):
|
||||
"""wait for all instances for a certain status or raise an exception"""
|
||||
for instance in nova.list_servers():
|
||||
@ -65,3 +86,150 @@ def wait_for_all_instances_status(status, timeout=None):
|
||||
host=instance._info[ # pylint: disable=W0212
|
||||
'OS-EXT-SRV-ATTR:hypervisor_hostname'])
|
||||
LOG.info(instance_info)
|
||||
|
||||
|
||||
def get_vms_table():
    """Build a dataframe of (vm_host, vm_id, vm_state) for all servers."""
    rows = []
    for server in nova.list_servers():
        hypervisor = server._info[  # pylint: disable=W0212
            'OS-EXT-SRV-ATTR:hypervisor_hostname']
        rows.append((hypervisor, server.id, server.status))
    return pandas.DataFrame(rows, columns=['vm_host', 'vm_id', 'vm_state'])
|
||||
|
||||
|
||||
def list_computes():
    """Return the hostname of every hypervisor."""
    hostnames = []
    for hypervisor in nova.list_hypervisors():
        hostnames.append(hypervisor.hypervisor_hostname)
    return hostnames
|
||||
|
||||
|
||||
def get_compute_vms_df(compute_host):
    """input: compute hostname (can be short)
    output: dataframe with vms of that host"""
    all_vms = get_vms_table()
    return all_vms[all_vms['vm_host'] == compute_host]
|
||||
|
||||
|
||||
def get_random_compute_with_vms_name():
    """Return the name of a compute hosting at least one vm (None if none)."""
    return next((compute for compute in list_computes()
                 if not get_compute_vms_df(compute).empty), None)
|
||||
|
||||
|
||||
def vm_info(vm_id, vms_df):
    """input: vm and a vms df
    output: the matching row(s) rendered as a string"""
    matching_rows = vms_df[vms_df['vm_id'] == vm_id]
    return matching_rows.to_string()
|
||||
|
||||
|
||||
def vm_df(vm_id, vms_df):
    """input: vm and a vms df
    output: the dataframe rows matching that vm id"""
    return vms_df[vms_df['vm_id'] == vm_id]
|
||||
|
||||
|
||||
def vm_floating_ip(vm_id):
    """input: vm_id
    output: its floating ip"""

    vm = nova.get_server(vm_id)
    # .first: take the first floating address reported for the server
    floating_ip = nova.list_server_ip_addresses(
        vm, address_type='floating').first
    return floating_ip
|
||||
|
||||
|
||||
def check_ping_vm_fip(fip):
    """Ping the floating ip until a reply arrives; assert it replied."""
    ping.ping_until_received(fip).assert_replied()
|
||||
|
||||
|
||||
def check_df_vms_ping(df):
    """input: dataframe with vms_ids
    try to ping all vms in df"""
    for current_vm_id in df['vm_id'].to_list():
        fip = vm_floating_ip(current_vm_id)
        check_ping_vm_fip(fip)
|
||||
|
||||
|
||||
def vm_location(vm_id, vms_df):
    """input: vm and a vms df
    output: the vm_host column for that vm, rendered without the index"""
    matching_hosts = vms_df[vms_df['vm_id'] == vm_id]['vm_host']
    return matching_hosts.to_string(index=False)
|
||||
|
||||
|
||||
def check_vm_evacuations(vms_df_old=None, vms_df_new=None, timeout=120,
                         interval=2, check_no_evacuation=False):
    """check evacuation of vms
    input: old and new vms_state_tables dfs

    Retries for up to *timeout* seconds (sleeping *interval* between
    attempts). By default a vm counts as failed when its host did NOT
    change; with check_no_evacuation=True the condition is inverted and a
    host change counts as failure.

    NOTE(review): vms_df_new is captured once by the caller and is not
    refreshed inside the retry loop, so retries re-compare the same
    snapshot - confirm this is intended.
    """
    failures = []
    start = time.time()

    while time.time() - start < timeout:
        # reset failures each attempt; only the last attempt's failures count
        failures = []
        for vm_id in vms_df_old.vm_id.to_list():
            old_bm_host = vm_location(vm_id, vms_df_old)
            new_vm_host = vm_location(vm_id, vms_df_new)

            if check_no_evacuation:
                cond = bool(old_bm_host != new_vm_host)
            else:
                cond = bool(old_bm_host == new_vm_host)

            if cond:
                failures.append(
                    'failed vm evacuations: {}\n\n'.format(vm_info(vm_id,
                                                                   vms_df_old)))
        if failures:
            LOG.info('Failed nova evacuation:\n {}'.format(failures))
            LOG.info('Not all nova vms evacuated ..')
            LOG.info('Retrying , timeout at: {}'
                     .format(timeout-(time.time() - start)))
            time.sleep(interval)
        else:
            LOG.info(vms_df_old.to_string())
            LOG.info('All vms were evacuated!')
            return
    # exhausted all retries
    if failures:
        tobiko.fail(
            'failed vm evacuations:\n{!s}', '\n'.join(failures))
|
||||
|
||||
|
||||
# check vm create with ssh and ping checks
|
||||
def random_vm_create(stack_name=None,
                     stack_template=stacks.CirrosServerStackFixture):
    """Create a vm stack and verify it via SSH and ICMP.

    stack_name: unique stack name, so that each time a new vm is created.
        Defaults to a freshly generated 'stack<random>' name on EVERY call.
        (The previous default expression was evaluated only once at import
        time, so all defaulted calls silently reused the same stack name.)
    stack_template: stack fixture class used to create the server.
    Returns the created stack fixture.
    """
    if stack_name is None:
        stack_name = 'stack{}'.format(random.randint(0, 1000000))
    # create a vm
    LOG.info(f'creating vm - {stack_name}')
    stack = stack_template(
        stack_name=stack_name)
    tobiko.reset_fixture(stack)
    stack.wait_for_create_complete()
    tobiko.cleanup_fixture(stack.ssh_client)
    # Test SSH connectivity to floating IP address
    sh.get_hostname(ssh_client=stack.ssh_client)

    # Test ICMP connectivity to floating IP address
    ping.ping_until_received(
        stack.floating_ip_address).assert_replied()
    return stack
|
||||
|
||||
|
||||
def random_vm_create_evacuable_image_tag():
    """Create a random vm using the evacuable-tagged image stack fixture."""
    return random_vm_create(stack_template=stacks.EvacuableServerStackFixture)
|
||||
|
||||
|
||||
def random_vm_create_shutoff_state():
    """Create a random vm and shut it off; returns the shutoff server."""
    return nova.shutoff_server(random_vm_create().server_details)
|
||||
|
||||
|
||||
def get_stack_server_id(stack):
    """Return the nova server id behind a server stack fixture."""
    return stack.server_details.id
|
||||
|
||||
|
||||
def create_multiple_unique_vms(n_vms=2):
    """create n_vms(int)

    Each vm gets its own randomly generated 'stack<random>' name.
    """
    for _ in range(n_vms):
        stack_name = 'stack{}'.format(random.randint(0, 1000000))
        random_vm_create(stack_name)
|
||||
|
@ -286,3 +286,17 @@ def get_overcloud_resource(resource_type=None,
|
||||
pcs_df_query_resource_type = pcs_df.query(
|
||||
'resource_type=="{}"'.format(resource_type))
|
||||
return pcs_df_query_resource_type['resource'].unique().tolist()
|
||||
|
||||
|
||||
def instanceha_delpoyed():
    """check IHA deployment
    checks for existence of the nova-evacuate resource

    NOTE: the misspelled name ('delpoyed') is referenced by the skip
    decorator below and must not change.
    """
    # guard clause: no overcloud means IHA cannot be deployed
    if not overcloud.has_overcloud():
        return False
    return get_overcloud_nodes_running_pcs_resource(
        resource='nova-evacuate')
|
||||
|
||||
|
||||
# Module-level skip decorator, evaluated once at import time: tests marked
# with it are skipped when the nova-evacuate pcs resource is absent.
skip_if_instanceha_not_delpoyed = tobiko.skip_unless(
    'instanceha not delpoyed', instanceha_delpoyed())
|
||||
|
Loading…
Reference in New Issue
Block a user