tobiko/tobiko/tripleo/nova.py
r 47ab780209 add all vm running virsh check
Change-Id: Ic3a6ca1949a24656d45e2216d66d9f3d298d0079
2020-12-14 20:57:55 +02:00

221 lines
7.4 KiB
Python

from __future__ import absolute_import
import time
import typing # noqa
from oslo_log import log
import pandas
import tobiko
from tobiko.tripleo import overcloud
from tobiko.shell import ping
from tobiko.shell import sh
from tobiko.openstack import nova
from tobiko.openstack import topology
from tobiko.tripleo import containers
# Module-level logger, named after this module via ``__name__``.
LOG = log.getLogger(__name__)
def check_nova_services_health(timeout=600., interval=2.):
    """Wait for the nova services to be up.

    A retry object is built with ``tobiko.retry`` from the given values and
    handed to ``nova.wait_for_services_up``.

    :param timeout: forwarded as ``timeout`` to ``tobiko.retry``
        (presumably seconds -- confirm against tobiko.retry)
    :param interval: forwarded as ``interval`` to ``tobiko.retry``
        (presumably seconds -- confirm against tobiko.retry)
    """
    nova.wait_for_services_up(
        retry=tobiko.retry(timeout=timeout, interval=interval))
def start_all_instances():
    """Activate every server listed by nova, failing on any not ACTIVE.

    For each server: request activation, pause three seconds, log a line
    with the server name, status and hypervisor host, and call
    ``tobiko.fail`` with that same line when the status is not ``ACTIVE``.
    """
    for server in nova.list_servers():
        started = nova.activate_server(server)
        # fixed pause after each activation before the status is inspected
        time.sleep(3)
        server_name = started.name
        server_state = started.status
        server_host = started._info[  # pylint: disable=W0212
            'OS-EXT-SRV-ATTR:hypervisor_hostname']
        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
            nova_instance=server_name, state=server_state, host=server_host)
        LOG.info(instance_info)
        if started.status == 'ACTIVE':
            continue
        tobiko.fail(instance_info)
def stop_all_instances():
    """Shut off every server listed by nova, failing on any not SHUTOFF.

    For each server: request shut-off, pause three seconds, log a line
    with the server name, status and hypervisor host, and call
    ``tobiko.fail`` with that same line when the status is not ``SHUTOFF``.
    """
    for server in nova.list_servers():
        stopped = nova.shutoff_server(server)
        # fixed pause after each shut-off before the status is inspected
        time.sleep(3)
        server_name = stopped.name
        server_state = stopped.status
        server_host = stopped._info[  # pylint: disable=W0212
            'OS-EXT-SRV-ATTR:hypervisor_hostname']
        instance_info = 'instance {nova_instance} is {state} on {host}'.format(
            nova_instance=server_name, state=server_state, host=server_host)
        LOG.info(instance_info)
        if stopped.status == 'SHUTOFF':
            continue
        tobiko.fail(instance_info)
def wait_for_all_instances_status(status, timeout=None):
    """Wait for each server listed by nova to reach ``status``.

    Servers are waited for one after another through
    ``nova.wait_for_server_status``; any error it raises propagates.
    After each wait a line is logged with the server name, the requested
    status and the hypervisor host.

    :param status: status every server is expected to reach
    :param timeout: forwarded unchanged to ``nova.wait_for_server_status``
    """
    for server in nova.list_servers():
        nova.wait_for_server_status(
            server=server.id, status=status, timeout=timeout)
        # NOTE: the logged state is the requested one, and the host comes
        # from the server object obtained before the wait.
        message = 'instance {nova_instance} is {state} on {host}'.format(
            nova_instance=server.name,
            state=status,
            host=server._info[  # pylint: disable=W0212
                'OS-EXT-SRV-ATTR:hypervisor_hostname'])
        LOG.info(message)
def get_vms_table():
    """Build a dataframe describing every server listed by nova.

    :return: ``pandas.DataFrame`` with one row per server and the columns
        ``vm_host`` (hypervisor hostname), ``vm_id`` and ``vm_state``
    """
    rows = []
    for server in nova.list_servers():
        hypervisor = server._info[  # pylint: disable=W0212
            'OS-EXT-SRV-ATTR:hypervisor_hostname']
        rows.append((hypervisor, server.id, server.status))
    return pandas.DataFrame(rows, columns=['vm_host', 'vm_id', 'vm_state'])
def list_computes():
    """Return the ``hypervisor_hostname`` of every hypervisor nova lists."""
    hostnames = []
    for hypervisor in nova.list_hypervisors():
        hostnames.append(hypervisor.hypervisor_hostname)
    return hostnames
def get_compute_vms_df(compute_host):
    """Return the rows of the vms table located on ``compute_host``.

    :param compute_host: host name compared for equality with the
        ``vm_host`` column of the table built by ``get_vms_table``
    :return: dataframe holding only the matching rows
    """
    all_vms = get_vms_table()
    return all_vms.query(f"vm_host=='{compute_host}'")
def get_random_compute_with_vms_name():
    """Return the name of a compute that hosts at least one vm.

    Computes are examined in the order given by ``list_computes`` and the
    first one whose vms dataframe is not empty is returned (no random
    choice is involved).  ``None`` is returned when no compute has vms.
    """
    computes_with_vms = (
        host for host in list_computes()
        if not get_compute_vms_df(host).empty)
    # the generator is lazy, so the lookup stops at the first match
    return next(computes_with_vms, None)
def vm_info(vm_id, vms_df):
    """Render the rows of ``vms_df`` belonging to ``vm_id`` as text.

    :param vm_id: value looked up in the ``vm_id`` column
    :param vms_df: dataframe holding a ``vm_id`` column
    :return: string representation of the matching rows
    """
    matching_rows = vms_df.query(f"vm_id == '{vm_id}'")
    return matching_rows.to_string()
def vm_df(vm_id, vms_df):
    """Select the rows of ``vms_df`` belonging to ``vm_id``.

    :param vm_id: value looked up in the ``vm_id`` column
    :param vms_df: dataframe holding a ``vm_id`` column
    :return: dataframe with only the matching rows (empty when none match)
    """
    selection = f"vm_id == '{vm_id}'"
    return vms_df.query(selection)
def vm_floating_ip(vm_id):
    """Return the first floating IP address of the server ``vm_id``.

    The server is fetched with ``nova.get_server`` and its addresses of
    type ``floating`` are listed; the ``first`` one is returned.
    """
    server = nova.get_server(vm_id)
    floating_addresses = nova.list_server_ip_addresses(
        server, address_type='floating')
    return floating_addresses.first
def check_ping_vm_fip(fip):
    """Ping ``fip`` until a reply is received and assert it replied."""
    ping_result = ping.ping_until_received(fip)
    ping_result.assert_replied()
def check_df_vms_ping(df):
    """Ping the floating IP of every vm listed in ``df``.

    :param df: dataframe with a ``vm_id`` column; each id is resolved to
        its floating IP, which is then checked with ``check_ping_vm_fip``
    """
    for vm_id in df.vm_id.to_list():
        floating_ip = vm_floating_ip(vm_id)
        check_ping_vm_fip(floating_ip)
def vm_location(vm_id, vms_df):
    """Return, as text, the host of ``vm_id`` according to ``vms_df``.

    :param vm_id: value looked up in the ``vm_id`` column
    :param vms_df: dataframe holding ``vm_id`` and ``vm_host`` columns
    :return: the ``vm_host`` values of the matching rows rendered with
        ``to_string(index=False)``
    """
    matching_hosts = vms_df.query(f"vm_id == '{vm_id}'")['vm_host']
    return matching_hosts.to_string(index=False)
def check_vm_evacuations(vms_df_old=None, compute_host=None, timeout=600,
                         interval=2, check_no_evacuation=False):
    """Check whether the vms of ``vms_df_old`` changed host, with retries.

    On every attempt the vms currently on ``compute_host`` are fetched and
    the location of each vm in ``vms_df_old`` is compared between the old
    dataframe and the fresh one.  The function returns as soon as one
    attempt finds no failure; once ``timeout`` has elapsed, the failures of
    the last attempt are reported through ``tobiko.fail``.

    :param vms_df_old: dataframe of the vms as they were before; it must
        provide a ``vm_id`` column (the ``None`` default is not usable)
    :param compute_host: host whose current vms are used as the new state
    :param timeout: seconds to keep retrying
    :param interval: seconds slept after a failed attempt
    :param check_no_evacuation: when false a vm fails if its location is
        unchanged; when true a vm fails if its location changed
    """
    failures = []
    start = time.time()

    while time.time() - start < timeout:
        failures = []
        vms_df_new = get_compute_vms_df(compute_host)
        for vm_id in vms_df_old.vm_id.to_list():
            previous_host = vm_location(vm_id, vms_df_old)
            current_host = vm_location(vm_id, vms_df_new)
            failed = (bool(previous_host != current_host)
                      if check_no_evacuation
                      else bool(previous_host == current_host))
            if not failed:
                continue
            details = vm_info(vm_id, vms_df_old)
            failures.append('failed vm evacuations: {}\n\n'.format(details))

        if not failures:
            LOG.info(vms_df_old.to_string())
            LOG.info('All vms were evacuated!')
            return

        LOG.info('Failed nova evacuation:\n {}'.format(failures))
        LOG.info('Not all nova vms evacuated ..')
        LOG.info('Retrying , timeout at: {}'
                 .format(timeout-(time.time() - start)))
        time.sleep(interval)

    # exhausted all retries
    if failures:
        tobiko.fail(
            'failed vm evacuations:\n{!s}', '\n'.join(failures))
def get_stack_server_id(stack):
    """Return the ``id`` found in the ``server_details`` of ``stack``."""
    details = stack.server_details
    return details.id
def get_fqdn_from_topology_node(topology_node):
    """Run ``hostname -f`` on the node and return its stripped output.

    :param topology_node: object whose ``ssh_client`` is used to run the
        command
    :return: standard output of the command without surrounding whitespace
    """
    # expect_exit_status=None presumably disables the exit status check
    # -- confirm against tobiko.shell.sh.execute
    result = sh.execute("hostname -f",
                        ssh_client=topology_node.ssh_client,
                        expect_exit_status=None)
    return result.stdout.strip()
def check_vm_running_via_virsh(topology_compute, vm_id):
    """Tell whether ``vm_id`` is reported as running by virsh.

    :param topology_compute: compute node queried for its running vms
    :param vm_id: uuid looked for among the running vm uuids
    :return: ``True`` when the uuid is listed, ``False`` otherwise
    """
    running_uuids = get_vm_uuid_list_running_via_virsh(topology_compute)
    return vm_id in running_uuids
def get_vm_uuid_list_running_via_virsh(topology_compute):
    """List the uuids of the vms virsh reports as running on a compute.

    With an overcloud, virsh is executed inside the ``nova_libvirt``
    container through the detected container runtime; otherwise virsh is
    run directly on the node.  In both cases the names of the running
    domains are listed and each one is resolved with ``virsh domuuid``.

    :param topology_compute: compute node whose ``ssh_client`` is used to
        run the command
    :return: list of strings obtained by splitting the command output on
        whitespace
    """
    if overcloud.has_overcloud():
        container_runtime = containers.get_container_runtime_name()
        # the whole loop is one single-quoted argument of "sh -c"
        command = f"sudo {container_runtime} exec nova_libvirt " \
                  "sh -c 'for i in `virsh list --name --state-running` " \
                  ";do virsh domuuid $i;done'"
    else:
        # No "sh -c '...'" wrapper here, so the loop must not end with a
        # quote: a lone trailing ' leaves the shell command unterminated.
        # domuuid runs through sudo like the listing, so both calls
        # address the same libvirt connection.
        command = "for i in `sudo virsh list --name --state-running` " \
                  ";do sudo virsh domuuid $i;done"
    result = sh.execute(command, ssh_client=topology_compute.ssh_client)
    return result.stdout.split()
def check_computes_vms_running_via_virsh():
    """Check through virsh that the vms of every compute are running.

    For each node of the ``compute`` group its FQDN is used to select the
    vms nova places on it; every vm is then polled with a retry built with
    ``timeout=120, interval=5`` until virsh lists it as running.  Each
    unsuccessful attempt is logged.
    """
    for compute in topology.list_openstack_nodes(group='compute'):
        hostname = get_fqdn_from_topology_node(compute)
        retry = tobiko.retry(timeout=120, interval=5)
        compute_vm_ids = get_compute_vms_df(hostname)['vm_id'].to_list()
        for vm_id in compute_vm_ids:
            # what happens once the retry is exhausted depends on
            # tobiko.retry -- presumably it raises; confirm there
            for _ in retry:
                if not check_vm_running_via_virsh(compute, vm_id):
                    LOG.info(f"{vm_id} is not in running state on "
                             f"{compute.hostname}")
                    continue
                LOG.info(f"{vm_id} is running ok on "
                         f"{compute.hostname}")
                break