tobiko/tobiko/tripleo/processes.py


from __future__ import absolute_import
import io
import re
from oslo_log import log
import pandas
import tobiko
from tobiko.openstack import neutron
from tobiko.openstack import topology
from tobiko.tripleo import overcloud
from tobiko.shell import sh
from tobiko.shell import ssh
LOG = log.getLogger(__name__)
class OvercloudProcessesException(tobiko.TobikoException):
message = "not all overcloud processes are in running state, " \
"{process_error}"
def get_overcloud_node_processes_table(ssh_client: ssh.SSHClientType):
"""
Get the processes table from an overcloud node.

Sample of the raw ps output that gets parsed:
[root@controller-0 ~]# ps -axw -o "%U" -o "|%p" -o "|%P" -o "|%C" -o "|%z" -o
"|%x" -o "|%c" -o "|%a" |grep -v 'ps -aux'|head
USER | PID| PPID|%CPU| VSZ| TIME|COMMAND |COMMAND
|overcloud_node
root | 1| 0| 1.3|246892|01:08:57|systemd |/usr/lib/systemd
controller-0 ...
/systemd --switched-root --system --deserialize 18
root | 2| 0| 0.0| 0|00:00:00|kthreadd |[kthreadd]
root | 3| 2| 0.0| 0|00:00:00|rcu_gp |[rcu_gp]
root | 4| 2| 0.0| 0|00:00:00|rcu_par_gp |[rcu_par_gp]
root | 6| 2| 0.0| 0|00:00:00|kworker/0:0H-ev|[kworker/0:0H
-events_highpri]
root | 8| 2| 0.0| 0|00:00:00|mm_percpu_wq |[mm_percpu_wq]
root | 9| 2| 0.0| 0|00:00:06|ksoftirqd/0 |[ksoftirqd/0]
root | 10| 2| 0.0| 0|00:04:28|rcu_sched |[rcu_sched]
root | 11| 2| 0.0| 0|00:00:05|migration/0 |[migration/0]
:return: pandas.DataFrame of the overcloud node's processes
"""
output = sh.execute(
"ps -axw -o \"%U\" -o \"DELIM%p\" -o \"DELIM%P\" -o \"DELIM%C\" -o "
"\"DELIM%z\" -o \"DELIM%x\" -o \"DELIM%c\" -o \"DELIM%a\" |grep -v "
"'ps -axw' |sed 's/\"/''/g'",
ssh_client=ssh_client).stdout
stream = io.StringIO(output)
table: pandas.DataFrame = pandas.read_csv(
stream, sep='DELIM', header=None, skiprows=1)
table.replace(to_replace=' ', value="", regex=True, inplace=True)
table.columns = ['USER', 'PID', 'PPID', 'CPU', 'VSZ', 'TIME', 'PROCESS',
'PROCESS_ARGS']
# pylint: disable=unsupported-assignment-operation
hostname = sh.get_hostname(ssh_client=ssh_client)
table['overcloud_node'] = hostname
LOG.debug("Successfully got overcloud nodes processes status table")
return table
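# Illustrative sketch (not part of the module) of how the table above is
# typically consumed; the way the ssh_client is obtained here is an
# assumption, any tobiko ssh.SSHClientType would do:
#
#   node = topology.list_openstack_nodes(group='controller')[0]
#   table = get_overcloud_node_processes_table(ssh_client=node.ssh_client)
#   # every row describes one process; filter with a pandas query, e.g.:
#   ovs_rows = table.query('PROCESS=="ovsdb-server"')
#   print(ovs_rows[['overcloud_node', 'PID', 'PROCESS_ARGS']])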
def get_overcloud_nodes_running_process(process):
"""
Check which overcloud nodes are running the specified process.
:param process: exact process name as seen in ps -axw -o "%c"
:return: list of overcloud node hostnames running the process
"""
oc_procs_df = overcloud.get_overcloud_nodes_dataframe(
get_overcloud_node_processes_table)
oc_nodes_running_process = oc_procs_df.query('PROCESS=="{}"'.format(
process))['overcloud_node'].unique()
return oc_nodes_running_process
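# Sketch of intended usage (illustrative, not from the original module):
#
#   nodes = get_overcloud_nodes_running_process('ovn-controller')
#   # e.g. array(['controller-0', 'compute-0', 'compute-1'], dtype=object)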
def check_if_process_running_on_overcloud(process):
"""
Check whether the specified process is running on any overcloud node.
:param process: exact process name as seen in ps -axw -o "%c"
:return: True if the process is running on at least one node, False otherwise
"""
oc_procs_df = overcloud.get_overcloud_nodes_dataframe(
get_overcloud_node_processes_table)
return not oc_procs_df.query('PROCESS=="{}"'.format(process)).empty
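# Sketch contrasting the two helpers above (illustrative only):
#
#   if check_if_process_running_on_overcloud('ovn-controller'):
#       nodes = get_overcloud_nodes_running_process('ovn-controller')
#   # the first returns a bool, the second the hostnames running the process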
class OvercloudProcessesStatus(object):
"""
Class to handle process checks.
Verifies that all of these processes are running in the overcloud:
'ovsdb-server','pcsd', 'corosync', 'beam.smp', 'mysqld', 'redis-server',
'haproxy', 'nova-conductor', 'nova-scheduler', 'neutron-server',
'nova-compute', 'glance-api'
"""
def __init__(self):
self.processes_to_check = ['ovsdb-server', 'pcsd', 'corosync',
'beam.smp', 'mysqld', 'redis-server',
'haproxy', 'nova-conductor',
'nova-scheduler', 'neutron-server',
'nova-compute', 'glance-api']
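# Assumption behind the next line: with a clustered (RAFT) OVN database
# ovn-northd runs on every controller, otherwise a single instance is active.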
num_northd_proc = 'all' if overcloud.is_ovn_using_raft() else 1
self.ovn_processes_to_check_per_node = [{'name': 'ovn-controller',
'node_group': 'controller',
'number': 'all'},
{'name': 'ovn-controller',
'node_group': 'compute',
'number': 'all'},
{'name': 'ovn-northd',
'node_group': 'controller',
'number': num_northd_proc}]
self.oc_procs_df = overcloud.get_overcloud_nodes_dataframe(
get_overcloud_node_processes_table)
def _basic_overcloud_process_running(self, process_name):
# osp16/python3 process is "neutron-server:"
if process_name == 'neutron-server' and \
self.oc_procs_df.query('PROCESS=="{}"'.format(
process_name)).empty:
process_name = 'neutron-server:'
# osp17 mysqld process name is mysqld_safe
if process_name == 'mysqld' and \
self.oc_procs_df.query('PROCESS=="{}"'.format(
process_name)).empty:
process_name = 'mysqld_safe'
# redis is not deployed on osp17 by default; it is only deployed when
# other services, such as designate or octavia, are enabled
if (process_name == 'redis-server' and
not overcloud.is_redis_expected()):
redis_message = ("redis-server not expected on OSP 17 "
"and later releases by default")
if self.oc_procs_df.query(
f'PROCESS=="{process_name}"').empty:
LOG.info(redis_message)
return
else:
raise OvercloudProcessesException(
process_error=redis_message)
if not self.oc_procs_df.query('PROCESS=="{}"'.format(
process_name)).empty:
LOG.info("overcloud processes status checks: "
"process {} is "
"in running state".format(process_name))
return
else:
LOG.info("Failure : overcloud processes status checks:"
"process {} is not running ".format(
process_name))
raise OvercloudProcessesException(
process_error="process {} is not running ".format(
process_name))
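# Note on the fallbacks above: the same pandas query is simply retried with
# the alternative process name (an illustration, not extra behaviour), e.g.
#
#   df.query('PROCESS=="neutron-server"')   # default name
#   df.query('PROCESS=="neutron-server:"')  # osp16/python3 name, per comment
#
# so a process only counts as missing when none of its known names match.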
@property
def basic_overcloud_processes_running(self):
"""
Check that the oc_procs_df dataframe contains every process listed in
processes_to_check, retrying until the timeout expires.
:return: bool
"""
for attempt in tobiko.retry(timeout=300., interval=1.):
try:
for process_name in self.processes_to_check:
self._basic_overcloud_process_running(process_name)
except OvercloudProcessesException:
if attempt.is_last:
LOG.error('Not all overcloud processes are running')
raise
LOG.info('Retrying overcloud processes: %s', attempt.details)
self.oc_procs_df = overcloud.get_overcloud_nodes_dataframe(
get_overcloud_node_processes_table)
# if all procs are running we can return true
return True
def _ovn_overcloud_process_validations(self, process_dict):
if not self.oc_procs_df.query('PROCESS=="{}"'.format(
process_dict['name'])).empty:
LOG.info("overcloud processes status checks: "
f"process {process_dict['name']} is "
"in running state")
ovn_proc_filtered_df = self.oc_procs_df.query(
'PROCESS=="{}"'.format(process_dict['name']))
if (process_dict['node_group'] not in
topology.list_openstack_node_groups()):
LOG.debug(
f"{process_dict['node_group']} is not "
"a node group part of this Openstack cloud")
return
node_list = [node.name
for node in
topology.list_openstack_nodes(
group=process_dict['node_group'])]
node_names_re = re.compile(r'|'.join(node_list))
node_filter = (ovn_proc_filtered_df.overcloud_node.
str.match(node_names_re))
# get the processes running on a specific type of nodes
ovn_proc_filtered_per_node_df = \
ovn_proc_filtered_df[node_filter]
total_num_processes = len(ovn_proc_filtered_per_node_df)
if isinstance(process_dict['number'], int):
expected_num_processes = process_dict['number']
elif process_dict['number'] == 'all':
expected_num_processes = len(node_list)
else:
raise ValueError(
f"Unexpected value: {process_dict['number']}")
if expected_num_processes != total_num_processes:
raise OvercloudProcessesException(
process_error="Unexpected number of processes "
f"{process_dict['name']} running on "
f"{process_dict['node_group']} nodes")
# process successfully validated
LOG.debug(f"{process_dict['name']} successfully validated on "
f"{process_dict['node_group']} nodes")
@property
def ovn_overcloud_processes_validations(self):
"""
Check that the oc_procs_df dataframe shows the OVN processes running on
the expected overcloud node or nodes.
:return: bool
"""
if not neutron.has_ovn():
LOG.info("Networking OVN not configured")
return True
for attempt in tobiko.retry(timeout=300., interval=1.):
try:
for process_dict in self.ovn_processes_to_check_per_node:
self._ovn_overcloud_process_validations(process_dict)
except OvercloudProcessesException:
if attempt.is_last:
LOG.error('Unexpected number of OVN overcloud processes')
raise
LOG.info('Retrying OVN overcloud processes: %s',
attempt.details)
self.oc_procs_df = overcloud.get_overcloud_nodes_dataframe(
get_overcloud_node_processes_table)
# if all procs are running we can return true
return True
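# Typical end-to-end usage (a sketch based on the code above, not an API
# promise): instantiate the class once and evaluate both properties; each one
# either returns True or raises OvercloudProcessesException once the retries
# are exhausted.
#
#   status = OvercloudProcessesStatus()
#   assert status.basic_overcloud_processes_running
#   assert status.ovn_overcloud_processes_validations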