#
# Copyright (c) 2019 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#

import configparser
import datetime
import os.path
import re
import time
from io import StringIO

import pexpect

from consts.auth import Tenant
from consts.timeout import MTCTimeout
from keywords import system_helper, host_helper
from utils.clients.ssh import ControllerClient
from utils.tis_log import LOG

KILL_CMD = 'kill -9'
PROCESS_TYPES = ['sm', 'pmon', 'other']
KILL_PROC_EVENT_FORMAT = {
    # documented:
    # 401.001 Service group <group> state change from <state> to <state> on
    # host <host_name>
    #
    # actual in 2017-02-20_22-01-22:
    # clear | 400.001 | Service group cloud-services degraded;
    # cinder-api(disabled, failed) |\
    # service_domain=controller.service_group=cloud-services.host=controller-1
    # log | 401.001 | Service group cloud-services state change from
    # active-degraded to active on host
    # set | 400.001 | Service group cloud-services degraded;
    # cinder-api(disabled, failed) |\
    # service_domain=controller.service_group=cloud-services.host=controller-1

    # 'sm': ('401.001',
    # actual in 2017-02-20_22-01-22:
    # clear 400.001 Service group cloud-services warning; nova-novnc(disabled,
    # failed)
    # service_domain=controller.service_group=cloud-services.host=controller-0

    # 'sm': ('400.001',
    #        r'Service group ([^\s]+) ([^\s]+);\s*(.*)',
    #        r'service_domain=controller\.service_group=([^\.]+)\.host=(.*)'),
    'sm': {
        'event_id': '400.001',
        'critical': (
            r'Service group ([^\s]+) ([^\s]+);\s*(.*)',
            r'service_domain=controller\.service_group=([^\.]+)\.host=(.*)'
        ),
        'major': (
            r'Service group ([^\s]+) ([^\s]+);\s*(.*)',
            r'service_domain=controller\.service_group=([^\.]+)\.host=(.*)'
        ),
        'minor': (
            r'Service group ([^\s]+) ([^\s]+);\s*(.*)',
            r'service_domain=controller\.service_group=([^\.]+)\.host=(.*)'
        ),
    },

    # set 200.006 controller-1 'acpid' process has failed.
    # Auto recovery in progress. host=controller-1.process=acpid minor
    'pmon': {
        'event_id': '200.006',
        # controller-1 critical 'sm' process has failed and could not be
        # auto-recovered gracefully.
        # Auto-recovery progression by host reboot is required and in
        # progress. host=controller-1.process=sm
        'critical': (
            r'([^\s]+) ([^\s]+) \'([^\']+)\' process has ([^\s]+) and could '
            r'not be auto-recovered gracefully. '
            r'Auto.recovery progression by host reboot is required and in '
            r'progress.',
            r'host=([^\.]+)\.process=([^\s]+)'
        ),
        # compute-2 is degraded due to the failure of its 'fsmond' process.
        # Auto recovery of this major
        # | host=compute-2.process= | major | process is in progress.
        'major': (
            r'([^\s]+) is ([^\s]+) due to the failure of its \'([^\']+)\' '
            r'process. Auto recovery of this ([^\s]+) '
            r'process is in progress.',
            r'host=([^\.]+)\.process=([^\s]+)'
        ),
        # clear 200.006 compute-2 'mtclogd' process has failed. Auto
        # recovery in progress.
        # host=compute-2.process=mtclogd minor
        # "compute-2 'mtclogd' process has failed. Auto recovery in progress."
        # set compute-1 'ntpd' process has failed. Manual recovery is required.
        'minor': (
            r"([^\s]+) '([^\']+)' process has ([^\s]+)\. [^\s]+ recovery.*",
            r'host=([^\.]+)\.process=([^\s]+)'
        ),
    },
}
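
# Illustrative sketch (comments only, not executed): how the patterns above
# are meant to be applied, using the sample PMON 'minor' reason text and
# entity-instance-id quoted in the comment blocks:
#
#     reason = ("compute-2 'mtclogd' process has failed. "
#               "Auto recovery in progress.")
#     m = re.match(KILL_PROC_EVENT_FORMAT['pmon']['minor'][0], reason)
#     # m.groups() -> ('compute-2', 'mtclogd', 'failed')
#
#     entity = 'host=compute-2.process=mtclogd'
#     m2 = re.match(KILL_PROC_EVENT_FORMAT['pmon']['minor'][1], entity)
#     # m2.groups() -> ('compute-2', 'mtclogd')
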
AVAILABILITY_MAPPINGS = {'active': 'enabled', 'enabled': 'active'}
PMON_PROC_CONF_DIR = '/etc/pmon.d'


def get_pmon_process_info(name, host, conf_file=None, con_ssh=None):
    """
    Get process info from its PMON config file

    Args:
        name (str): name of the PMON process
        host (str): host on which the PMON process is running
        con_ssh: connection to the active controller
        conf_file (str): configuration file for the PMON process

    Returns (dict): settings of the process

    """
    LOG.info('Get PMON process information for {}'.format(name))

    if not conf_file:
        file_name = '{}.conf'.format(name)
    else:
        file_name = conf_file

    cmd = 'cat {}'.format(os.path.join(PMON_PROC_CONF_DIR, file_name))

    with host_helper.ssh_to_host(host, con_ssh=con_ssh) as con0_ssh:
        code, output = con0_ssh.exec_sudo_cmd(cmd)

    if 0 != code or not output.strip():
        LOG.error(
            'Failed to read config file:{}/{} for PMON process:{} on host:{}, '
            'code:{}, message:{}'.format(
                PMON_PROC_CONF_DIR, file_name, name, host, code, output))
        return {}

    conf_parser = configparser.ConfigParser()
    conf_parser.read_file(StringIO(output))

    settings = {}

    if 'process' in conf_parser.sections():
        # strip trailing inline comments (separated by ';') from the values
        settings = {k.strip(): v.split(';')[0].strip() for k, v in
                    conf_parser.items('process')}

    settings['interval'] = int(settings.get('interval', 5))
    settings['debounce'] = int(settings.get('debounce', 20))
    LOG.debug('process settings:{}'.format(settings))
    return settings


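# Illustrative usage sketch (comments only, not executed), reading a typical
# pmon config such as /etc/pmon.d/ntpd.conf; which keys exist besides
# 'interval' and 'debounce' depends on the conf file itself:
#
#     settings = get_pmon_process_info('ntpd', 'controller-0')
#     # settings['interval'] and settings['debounce'] default to 5 and 20
#     # when absent from the [process] section

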
def get_ancestor_process(name, host, cmd='', fail_ok=False, retries=5,
                         retry_interval=3, con_ssh=None):
    """
    Get the ancestor of the processes with the given name and command-line,
    if any.

    Args:
        name: name of the process
        host: host on which to find the process
        cmd: executable name
        fail_ok: do not raise an exception on error
        retries: times to try before returning
        retry_interval: wait before the next re-try
        con_ssh: ssh connection/client to the active controller

    Returns:
        pid (int), process id, -1 if there is any error
        ppid (int), parent process id, -1 if there is any error
        cmdline (str), command line of the process
    """
    retries = retries if retries > 1 else 3
    retry_interval = retry_interval if retry_interval > 0 else 1

    if cmd:
        ps_cmd = r'ps -e -oppid,pid,cmd | /usr/bin/grep "{}\|{}" | ' \
                 r'/usr/bin/grep -v grep | /usr/bin/grep {}'.\
            format(name, os.path.basename(cmd), cmd)
    else:
        ps_cmd = 'ps -e -oppid,pid,cmd | /usr/bin/grep "{}" | /usr/bin/grep ' \
                 '-v grep'.format(name)

    code, output = -1, ''
    if fail_ok:
        for count in range(retries):
            with host_helper.ssh_to_host(host, con_ssh=con_ssh) as con0_ssh:
                code, output = con0_ssh.exec_cmd(ps_cmd, fail_ok=True)
                if 0 == code and output.strip():
                    break
            LOG.warn('Failed to run cli:{} on controller at retry:{:02d}, '
                     'wait:{} seconds and try again'.format(ps_cmd, count,
                                                            retry_interval))
            time.sleep(retry_interval)
    else:
        with host_helper.ssh_to_host(host, con_ssh=con_ssh) as con0_ssh:
            code, output = con0_ssh.exec_cmd(ps_cmd, fail_ok=False)

    if not (0 == code and output.strip()):
        LOG.error(
            'Failed to find process with name:{} and cmd:{}'.format(name, cmd))
        return -1, -1, ''

    procs = []
    ppids = []
    for line in output.strip().splitlines():
        proc_attr = line.strip().split()
        if not proc_attr:
            continue
        try:
            ppid = int(proc_attr[0].strip())
            pid = int(proc_attr[1].strip())
            cmdline = ' '.join(proc_attr[2:])
            LOG.info('ppid={}, pid={}\ncmdline={}'.format(ppid, pid, cmdline))
        except (IndexError, ValueError):
            LOG.warn(
                'Failed to parse ps output?! cmd={}, line={}, '
                'output={}'.format(ps_cmd, line, output.strip()))
            continue

        if cmd and cmd not in cmdline:
            continue
        procs.append((pid, ppid, cmdline))
        ppids.append(ppid)

    if not procs:
        LOG.error(
            'Could not find process with name:{} and cmd:{}'.format(name, cmd))
        return -1, -1, ''

    # procs entries are (pid, ppid, cmdline)
    pids = [v[0] for v in procs]

    if len(pids) == 1:
        LOG.info('procs[0]:{}'.format(procs[0]))
        return procs[0]

    LOG.warn(
        'Multiple ({}) parent processes?, ppids:{}'.format(len(ppids), ppids))

    if 1 not in ppids:
        LOG.warn(
            'Init is not the grandparent process?, ppids:{}'.format(ppids))

    # the ancestor is the process which is a parent of the other matched
    # processes, while its own parent is not among them
    for pid, ppid, cmdline in procs:
        if pid in ppids and ppid not in pids and 1 != pid:
            LOG.info('pid={}, ppid={}, cmdline={}'.format(pid, ppid, cmdline))
            return pid, ppid, cmdline

    LOG.error(
        'Could not find process, procs:{}, ppids:{}, pids:{}'.format(procs,
                                                                     ppids,
                                                                     pids))
    return -1, -1, ''


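# Illustrative usage sketch (comments only, not executed), assuming a
# 'fsmond' process running on compute-0:
#
#     pid, ppid, cmdline = get_ancestor_process('fsmond', 'compute-0')
#     # (-1, -1, '') is returned when the process cannot be found

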
def verify_process_with_pid_file(pid, pid_file, con_ssh=None):
    """
    Check if the given PID matches the PID in the specified pid_file

    Args:
        pid: process id
        pid_file: the file containing the process id
        con_ssh: ssh connection/client to the host on which the process
            resides

    Returns (bool): True if the PIDs match, False otherwise

    """
    con_ssh = con_ssh or ControllerClient.get_active_controller()

    code, output = con_ssh.exec_sudo_cmd('cat {} | head -n1'.format(pid_file),
                                         fail_ok=False)
    LOG.info('code={}, output={}'.format(code, output))

    output = output.strip()
    if not output or int(output) != pid:
        LOG.info('Mismatched PID, expected:<{}>, from pid_file:<{}>, '
                 'pid_file={}'.format(pid, output, pid_file))
        return False
    else:
        LOG.info(
            'OK PID:{} matches with that from pid_file:{}, pid_file={}'.format(
                pid, output, pid_file))
        return True


def get_process_from_sm(name, con_ssh=None, pid_file='',
                        expecting_status='enabled-active'):
    """
    Get the information for the process from SM, including PID, Name, Current
    Status and Pid-File

    Args:
        name: name of the process
        con_ssh: ssh connection/client to the active-controller
        pid_file: known pid-file path/name to compare with
        expecting_status: expected status of the process

    Returns:
        pid (int): process id
        proc_name (str): process name
        impact (str): impact of the process failure
        actual_status (str): actual/current status of the process
        sm_pid_file (str): pid-file in records of SM
    """
    con_ssh = con_ssh or ControllerClient.get_active_controller()

    cmd = "true; NM={}; sudo sm-dump --impact --pid --pid_file | awk -v " \
          "pname=$NM '{{ if ($1 == pname) print }}'; " \
          "echo".format(name)

    code, output = con_ssh.exec_sudo_cmd(cmd, fail_ok=True)

    pid, proc_name, impact, sm_pid_file, actual_status = -1, '', '', '', ''

    if 0 != code or not output:
        LOG.warn(
            'Cannot find the process:{} in SM, error code:{}, '
            'output:{}'.format(name, code, output))
        return pid, proc_name, impact, actual_status, sm_pid_file

    for line in output.splitlines():
        if not line.strip():
            continue
        pid, proc_name, impact, sm_pid_file, actual_status = -1, '', '', '', ''

        results_array = line.strip().split()
        LOG.info('results_array={}'.format(results_array))

        if len(results_array) != 6:
            LOG.debug(
                'Invalid format from output of sm-dump?! line={}\n'
                'cmd={}'.format(line, cmd))
            continue

        proc_name = results_array[0]
        if proc_name != name:
            continue

        expect_status = results_array[1]
        actual_status = results_array[2]

        if expect_status != actual_status:
            LOG.warn(
                'service:{} is not in expected status yet. expected:{}, '
                'actual:{}. Retry'.format(
                    proc_name, expect_status, actual_status))
            continue

        if actual_status != expecting_status:
            LOG.warn(
                'service:{} is not in expected status yet. expected:{}, '
                'actual:{}. Retry'.format(
                    proc_name, expecting_status, actual_status))
            break

        impact = results_array[3]

        pid = int(results_array[4].strip())
        sm_pid_file = results_array[5]

        if pid_file and sm_pid_file != pid_file:
            LOG.warn(
                'pid_file differs from input pid_file, pid_file={}, '
                'sm-dump-pid_file={}'.format(
                    pid_file, sm_pid_file))

        if sm_pid_file:
            if not verify_process_with_pid_file(pid, sm_pid_file,
                                                con_ssh=con_ssh):
                LOG.warn(
                    'pid of service mismatches that from pid-file, pid:{}, '
                    'pid-file:{}, proc-name:{}'.format(
                        pid, sm_pid_file, proc_name))
        # found
        break

    if -1 != pid:
        host = system_helper.get_active_controller_name()
        running, msg = is_process_running(pid, host)
        if not running:
            LOG.warn(
                'Process does not exist, name={}, pid={}, msg={}'.format(
                    name, pid, msg))
            return -1, '', '', '', ''
        else:
            LOG.info(
                'OK, Process is running: name={}, pid={}, output={}'.format(
                    name, pid, msg))

    return pid, proc_name, impact, actual_status, sm_pid_file


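# Illustrative usage sketch (comments only, not executed):
#
#     pid, proc_name, impact, status, pid_file = get_process_from_sm(
#         'glance-api')
#     # pid == -1 indicates the process was not found or is not running;
#     # 'impact' is the sm-dump IMPACT column for the service

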
def is_controller_swacted(
        prev_active, prev_standby,
        swact_start_timeout=MTCTimeout.KILL_PROCESS_SWACT_NOT_START,
        swact_complete_timeout=MTCTimeout.KILL_PROCESS_SWACT_COMPLETE,
        con_ssh=None):
    """
    Wait and check if the active-controller on the system was 'swacted'
    within the given time period

    Args:
        prev_active: previous active controller
        prev_standby: previous standby controller
        swact_start_timeout: check within this time frame if the swacting
            started
        swact_complete_timeout: check if the swacting (if any) completed in
            this time period
        con_ssh: ssh connection/client to the current
            active-controller

    Returns (bool): True if the controllers swacted, False otherwise

    """
    LOG.info(
        'Check if the controllers started to swact within:{}, and completing '
        'swacting within:{}'.format(
            swact_start_timeout, swact_complete_timeout))

    code = -1
    host = prev_active
    for retry in range(1, 5):
        LOG.info(
            'retry{:02d}: checking if swacting triggered, '
            'prev-active-controller={}'.format(
                retry, prev_active))
        code = 0
        try:
            code, msg = host_helper.wait_for_swact_complete(
                host, con_ssh=con_ssh, fail_ok=True,
                swact_start_timeout=swact_start_timeout,
                swact_complete_timeout=swact_complete_timeout)

            if 0 == code:
                LOG.info(
                    'OK, host-swacted, prev-active:{}, prev-standby:{}, '
                    'code:{}, message:{}'.format(
                        prev_active, prev_standby, code, msg))
                return True

            active, standby = system_helper.get_active_standby_controllers()
            if active == prev_standby and standby == prev_active:
                LOG.info(
                    'swacted?! prev-active:{} prev-standby:{}, cur active:{}, '
                    'cur standby:{}'.format(
                        prev_active, prev_standby, active, standby))
                return True
            break

        except Exception as e:
            LOG.warn(
                'Error occurred, indicating the system is in an unstable '
                'state, probably because a swact is in progress. '
                'previous active-controller:{}, previous standby-controller:{}'
                '\nerror message:{}'.format(prev_active, prev_standby, e))

            if retry >= 4:
                LOG.error(
                    'Fail the test after retrying {} times, system remains '
                    'in an unstable state, probably because a swact is in '
                    'progress. previous active-controller:{}, '
                    'previous standby-controller:{}\nerror message:{}'.
                    format(retry, prev_active, prev_standby, e))
                raise

            time.sleep(10)

    return 0 == code


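# Illustrative usage sketch (comments only, not executed):
#
#     active, standby = system_helper.get_active_standby_controllers()
#     ...  # trigger something that should cause a swact
#     assert is_controller_swacted(active, standby), 'no swact happened'

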
def wait_for_sm_process_events(service, host, target_status, expecting=True,
                               severity='major',
                               last_events=None, process_type='sm', timeout=60,
                               interval=3, con_ssh=None):
    """Wait for the set/clear event pair expected after killing a process"""
    if process_type not in KILL_PROC_EVENT_FORMAT:
        LOG.error('unknown type of process:{}'.format(process_type))

    event_log_id = KILL_PROC_EVENT_FORMAT[process_type]['event_id']
    reason_pattern, entity_id_pattern = KILL_PROC_EVENT_FORMAT[process_type][
        severity][0:2]

    if last_events is not None:
        last_event = last_events['values'][0]
        start_time = \
            last_event[1].replace('-', '').replace('T', ' ').split('.')[0]
    else:
        start_time = ''

    search_keys = {
        'Event Log ID': event_log_id,
        'Reason Text': reason_pattern,
        'Entity Instance ID': entity_id_pattern,
    }

    expected_availability = target_status.get('availability', None)

    matched_events = []
    stop_time = time.time() + timeout
    if expecting and (service == 'nova-novnc' or service == 'vim-webserver'):
        stop_time = time.time() + timeout + 300
        interval = 60
    retry = 0
    while time.time() < stop_time:
        retry += 1
        matched_events[:] = []
        events_table = system_helper.get_events_table(
            event_log_id=event_log_id, show_uuid=True,
            start=start_time, limit=10, con_ssh=con_ssh, regex=True,
            **search_keys)

        if not events_table or not events_table['values']:
            LOG.warn(
                'run{:02d} for process:{}: Empty event table?!\n'
                'events_table:{}\nevent_id={}, '
                'start={}\nkeys={}, severity={}'.
                format(retry, service, events_table, event_log_id, start_time,
                       search_keys, severity))
            time.sleep(interval)
            continue

        for event in events_table['values']:
            try:
                actual_event_id = event[3].strip()
                if actual_event_id != event_log_id:
                    LOG.warn('Irrelevant event? event-list querying broken?!'
                             ' looking-for-event-id={}, actual-event-id={}, '
                             'event={}'.
                             format(event_log_id, actual_event_id, event))
                    continue

                actual_state = event[2]
                if actual_state not in ('set', 'clear'):
                    LOG.info(
                        'State not matching, expected-state="set" or '
                        '"clear", actual-state={}, event={}'.format(
                            actual_state, event))
                    continue

                actual_reason = event[4].strip()
                # e.g. ('cloud-services', 'active', 'active-degraded',
                # 'controller-0;', ' glance-api(disabled, failed)')
                m = re.match(reason_pattern, actual_reason)
                if not m:
                    LOG.info(
                        'Not matched event:{},\nevent_id={}, start={}, '
                        'reason_text={}'.format(
                            event, event_log_id, start_time, reason_pattern))
                    continue

                actual_group_status = m.group(2)
                if actual_group_status not in ('active', expected_availability):
                    LOG.info(
                        'Group status not matching!, expected-status={}, '
                        'actual-status={}\nevent={}'.format(
                            expected_availability, actual_group_status, event))
                    continue

                if 'host={}'.format(host) not in event[5]:
                    LOG.info(
                        'Host not matching, expected-host={}, actual-host={}, '
                        'event={}'.format(
                            host, event[5], event))
                    continue

                actual_service_name, status = m.group(3).split('(')
                service_operational, service_availability = status.split(',')
                matched_events.append(dict(
                    uuid=event[0],
                    event=event[1:-1],
                    service=actual_service_name,
                    service_operational=service_operational,
                    service_availability=service_availability.strip().strip(
                        ')'),
                    group_name=m.group(1),
                    group_prev_status=m.group(2),
                    group_status=m.group(3)
                ))

                if not expecting:
                    LOG.error(
                        'Found set/clear event while it should NOT\nevent:'
                        '{}'.format(event))
                    return -1, tuple(matched_events)

                matched_events = list(reversed(matched_events))
                if len(matched_events) > 1:
                    if matched_events[-1]['event'][1] == 'clear' and \
                            matched_events[-2]['event'][1] == 'set':
                        LOG.info('OK, found matched events:{}'.format(
                            matched_events))
                        return 0, tuple(matched_events)

            except IndexError:
                LOG.error(
                    'CLI fm event-list changed its output format?\nsearching '
                    'keys={}'.format(
                        search_keys))
                raise

        LOG.warn(
            'No matched event found at try:{}, will sleep {} seconds and '
            'retry\nmatched events:\n{}, host={}'.format(
                retry, interval, matched_events, host))

        time.sleep(interval)
        continue

    LOG.info('No matched events:\n{}'.format(matched_events))

    return -1, tuple()


def _check_status_after_killing_process(service, host, target_status,
                                        expecting=True, process_type='sm',
                                        last_events=None, con_ssh=None,
                                        auth_info=Tenant.get('admin_platform')):
    """Verify the host/service status after killing the given process"""
    LOG.info(
        'check for process:{} on host:{} expecting status:{}, process_type:'
        '{}'.format(service, host, target_status, process_type))

    try:
        operational, availability = target_status.split('-')
    except ValueError as e:
        LOG.error('unknown host status:{}, error:{}'.format(target_status, e))
        raise

    expected = {'operational': operational, 'availability': availability}

    if availability == 'warning':
        LOG.info('impact:{} meaning: operational={}, availability={}'.format(
            target_status, operational, availability))
        code, _ = wait_for_sm_process_events(
            service,
            host,
            expected,
            expecting=expecting,
            last_events=last_events,
            process_type=process_type,
            con_ssh=con_ssh)

        return (0 == code) == expecting

    total_wait = 120 if expecting else 30
    time.sleep(1)

    found = system_helper.wait_for_host_values(host, timeout=total_wait / 2,
                                               con_ssh=con_ssh, fail_ok=True,
                                               auth_info=auth_info, **expected)

    if expecting and found:
        LOG.debug('OK, process:{} in status:{} as expected.'.format(
            service, target_status))

        LOG.debug('Next, wait and verify the system recovers')
        expected = {'operational': 'enabled', 'availability': 'available'}
        return system_helper.wait_for_host_values(
            host, timeout=total_wait / 2, con_ssh=con_ssh, auth_info=auth_info,
            fail_ok=True, **expected)

    elif not expecting and found:
        LOG.error('Unexpected status for process:{}, expected status:{}'.format(
            service, expected))
        return False

    elif not expecting and not found:
        LOG.info(
            'OK, IMPACT did not happen which is correct. '
            'target_status={}'.format(target_status))
        return True

    elif expecting and not found:
        LOG.warn(
            'host is not in expected status:{} for service:{}'.format(expected,
                                                                      service))

        code = wait_for_sm_process_events(
            service, host, expected, expecting=expecting,
            last_events=last_events,
            process_type=process_type, con_ssh=con_ssh)[0]

        return 0 == code

    else:
        # should never reach here
        pass


def check_impact(impact, service_name, host='', last_events=None,
                 expecting_impact=False, process_type='sm', con_ssh=None,
                 timeout=80, **kwargs):
    """
    Check if the expected IMPACT happens (or NOT) on the specified host

    Args:
        impact (str): system behavior to check, including:
            swact -- the active controller swacting
            enabled-degraded -- the host changed to
                'enabled-degraded' status
            disabled-failed -- the host changed to
                'disabled-failed' status
            ...
        service_name (str): name of the service/process
        host (str): the host to check
        last_events (dict): the last events before action
        expecting_impact (bool): whether the IMPACT should happen
        process_type (str): type of the process: sm, pmon, other
        con_ssh: ssh connection/client to the active controller
        timeout
        **kwargs:

    Returns:
        boolean - whether the IMPACT happens as expected

    """
    LOG.info(
        'Checking impact:{} on host:{} after killing process:{}, '
        'process_type={}'.format(
            impact, host, service_name, process_type))

    prev_active = kwargs.get('active_controller', 'controller-0')
    prev_standby = kwargs.get('standby_controller', 'controller-1')
    severity = kwargs.get('severity', 'major')

    if impact == 'swact':
        if expecting_impact:
            return is_controller_swacted(prev_active, prev_standby,
                                         con_ssh=con_ssh,
                                         swact_start_timeout=max(timeout / 2,
                                                                 20),
                                         swact_complete_timeout=timeout)
        else:
            return not is_controller_swacted(prev_active, prev_standby,
                                             con_ssh=con_ssh,
                                             swact_start_timeout=timeout / 4)

    elif impact in ('enabled-degraded', 'enabled-warning'):
        return _check_status_after_killing_process(
            service_name, host, target_status=impact,
            expecting=expecting_impact,
            process_type=process_type, last_events=last_events, con_ssh=con_ssh)

    elif impact == 'disabled-failed':

        if host == prev_active:
            LOG.info(
                'Killing PMON process:{} on active host:{} will trigger '
                'swact. impact:{}, '
                'severity:{}'.format(service_name, host, impact, severity))
            swacted = is_controller_swacted(prev_active, prev_standby,
                                            con_ssh=con_ssh,
                                            swact_start_timeout=20,
                                            swact_complete_timeout=timeout)
            assert swacted, 'Active-controller must be swacted before being ' \
                            'taken into disabled-failed status'

        operational, availability = impact.split('-')
        expected = {'operational': operational, 'availability': availability}

        reached = system_helper.wait_for_host_values(host, timeout=timeout,
                                                     con_ssh=con_ssh,
                                                     fail_ok=True, **expected)
        if reached and expecting_impact:
            LOG.info(
                'host {} reached status {} as expected after killing process '
                '{}'.format(
                    host, expected, service_name))
            return True

        elif not reached and not expecting_impact:
            LOG.info(
                'host {} DID NOT reach status {} (as expected) after killing '
                'process {}'.format(
                    host, expected, service_name))
            return True

        else:
            LOG.error(
                'Host:{} did not get into status:{} in {} seconds, searching '
                'for related events'.format(
                    host, expected, timeout))

            # todo: it's better to do this in parallel with process-monitoring
            expected = {'operational': 'enabled',
                        'availability': ['available', 'degraded']}
            reached = system_helper.wait_for_host_values(host, timeout=timeout,
                                                         con_ssh=con_ssh,
                                                         fail_ok=True,
                                                         **expected)

            if reached:
                LOG.info(
                    'Host:{} recovered into status:{} in {} '
                    'seconds'.format(
                        host, expected, timeout))
                return True

            LOG.error(
                'Host:{} did not get into status:{} in {} seconds, and there '
                'are no related events'.format(
                    host, expected, timeout))

            return False
    else:
        LOG.warn(
            'impact-checker for impact:{} not implemented yet, '
            'kwargs:{}'.format(impact, kwargs))
        return False


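# Illustrative usage sketch (comments only, not executed), assuming the
# process was killed enough times to trigger its IMPACT:
#
#     ok = check_impact('swact', 'glance-api',
#                       active_controller='controller-0',
#                       standby_controller='controller-1',
#                       expecting_impact=True, process_type='sm')
#     assert ok, 'expected swact did not happen'

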
def get_pmon_process_id(pid_file, host, con_ssh=None):
    """Read the process id of a PMON process from its pid-file on the host"""
    cmd = 'cat {} 2>/dev/null | head -n1 && echo 2>/dev/null'.format(pid_file)

    with host_helper.ssh_to_host(host, con_ssh=con_ssh) as con:
        code, output = con.exec_cmd(cmd)

    if output.strip():
        return int(output.strip())

    return -1


def get_process_info(name, cmd='', pid_file='', host='', process_type='sm',
                     con_ssh=None,
                     auth_info=Tenant.get('admin_platform')):
    """
    Get the information of the process with the specified name

    Args:
        name (str): name of the process
        cmd (str): path of the executable
        pid_file (str): path of the file containing the process id
        host (str): host on which the process resides
        process_type (str): type of service/process, must be one of 'sm',
            'pmon', 'other'
        con_ssh: ssh connection/client to the active controller
        auth_info

    Returns (tuple):
        for 'sm' processes: (pid, name, impact, status, pid_file)
        for 'pmon' and other processes: (pid, name)
        pid is -1 on any error

    """
    LOG.info('name:{} cmd={} pid_file={} host={} process_type={}'.format(
        name, cmd, pid_file, host, process_type))

    active_controller = system_helper.get_active_controller_name(
        con_ssh=con_ssh, auth_info=auth_info)
    if not host:
        host = active_controller

    if process_type == 'sm':
        LOG.debug(
            'to get_process_info for SM process:{} on host:{}'.format(name,
                                                                      host))

        if host != active_controller:
            LOG.warn(
                'Already swacted? host:{} is not the active controller now. '
                'Active controller is {}'.format(
                    host, active_controller))
        pid, name, impact, status, pid_file = get_process_from_sm(
            name, con_ssh=con_ssh, pid_file=pid_file)
        if status != 'enabled-active':
            LOG.warn('SM process is in status:{}, not "enabled-active"'.format(
                status))
            if 'disabl' in status:
                LOG.warn(
                    'Wrong controller? Or controller already swacted, '
                    'wait and try on the other controller')
                time.sleep(10)
                return get_process_from_sm(name, pid_file=pid_file)

            return -1, name, impact, status, pid_file
        else:
            return pid, name, impact, status, pid_file

    elif process_type == 'pmon':
        pid = get_pmon_process_id(pid_file, host, con_ssh=con_ssh)
        LOG.info('Found: PID={} for PMON process:{}'.format(pid, name))
        return pid, name

    else:
        LOG.info('Try to find the process:{} using "ps"'.format(name))

        pid = get_ancestor_process(name, host, cmd=cmd, con_ssh=con_ssh)[0]
        if -1 == pid:
            return -1, ''

        return pid, name


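# Illustrative usage sketch (comments only, not executed); the pid-file path
# below is an example and may differ per load:
#
#     pid, proc_name = get_process_info(
#         'fsmond', process_type='pmon',
#         pid_file='/var/run/fsmond.pid', host='compute-0')

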
def is_process_running(pid, host, con_ssh=None, retries=3, interval=3):
    """
    Check if a process with the given PID exists

    Args:
        pid (int): process id
        host (str): host on which the process resides
        con_ssh: ssh connection/client to the host
        retries (int): times to re-try if no process found before returning
            failure
        interval (int): time to wait before next re-try

    Returns:
        boolean - True if the process exists, False otherwise
        msg (str) - the details of the process or error messages
    """
    cmd = 'ps -p {}'.format(pid)
    for _ in range(retries):
        with host_helper.ssh_to_host(host, con_ssh=con_ssh) as host_ssh:
            code, output = host_ssh.exec_cmd(cmd, fail_ok=True)
            if 0 != code:
                LOG.warn(
                    'Process:{} DOES NOT exist, error:{}'.format(pid, output))
            else:
                return True, output
        time.sleep(interval)

    return False, ''


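# Illustrative usage sketch (comments only, not executed):
#
#     running, details = is_process_running(12345, 'controller-0')
#     # 'details' holds the `ps -p` output when the process exists

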
def _get_last_events_timestamps(limit=1, event_log_id=None, con_ssh=None,
                                auth_info=Tenant.get('admin_platform')):
    latest_events = system_helper.get_events_table(limit=limit,
                                                   event_log_id=event_log_id,
                                                   show_uuid=True,
                                                   con_ssh=con_ssh,
                                                   auth_info=auth_info)

    return latest_events


def kill_sm_process_and_verify_impact(name, cmd='', pid_file='', retries=2,
                                      impact='swact', host='controller-0',
                                      interval=20, action_timeout=90,
                                      total_retries=3, process_type='sm',
                                      on_active_controller=True, con_ssh=None,
                                      auth_info=Tenant.get('admin_platform')):
    """
    Kill the process with the specified name and verify the system behaves
    as expected

    Args:
        name (str): name of the process
        cmd (str): executable of the process
        pid_file (str): file containing process id
        retries (int): times of killing actions upon which the
            IMPACT will be triggered
        impact (str): system behavior including:
            swact -- active controller is swacted
            enabled-degraded -- the status of the host will change to
                'enabled-degraded'
            disabled-failed -- the status of the host will change to
                'disabled-failed'
            ...
        host (str): host to test on
        interval (int): least time to wait between kills
        action_timeout (int): kills and impact should happen within this
            time frame
        total_retries (int): total number of retries for whole kill and
            wait actions
        process_type (str): valid types are: sm, pmon, other
        on_active_controller (boolean):
        con_ssh: ssh connection/client to the active controller
        auth_info

    Returns: (pid, host)
        pid:
            >0 success, the final PID of the process
            -1 fail because of impact NOT happening after killing the
                process up to threshold times
            -2 fail because of impact happening before killing threshold times
            -3 fail after trying total_retries times
        host:
            the host tested on
    """
    active_controller, standby_controller = \
        system_helper.get_active_standby_controllers(con_ssh=con_ssh,
                                                     auth_info=auth_info)

    if on_active_controller:
        LOG.info(
            'on active controller: {}, host:{}'.format(active_controller,
                                                       host))

        host = active_controller
        con_ssh = con_ssh or ControllerClient.get_active_controller()

    LOG.info('on host: {}'.format(host))

    if total_retries < 1 or retries < 1:
        LOG.error(
            'retries/total-retries < 1? retries:{}, total retries:{}'.format(
                retries, total_retries))
        return None
    count = 0
    for i in range(1, total_retries + 1):
        LOG.info(
            'retry:{:02d} kill the process:{} and verify impact:{}'.format(
                i, name, impact))

        exec_times = []
        killed_pids = []

        timeout = time.time() + action_timeout * (
            retries / 2 if retries > 2 else 1)

        while time.time() < timeout:
            count += 1

            LOG.debug(
                'retry{:02d}-{:02d}: looking up process:{} on '
                'host:{}'.format(i, count, name, host))

            try:
                pid, proc_name = get_process_info(name, cmd=cmd, host=host,
                                                  process_type=process_type,
                                                  pid_file=pid_file,
                                                  con_ssh=con_ssh)[0:2]

            except pexpect.exceptions.EOF:
                LOG.warn(
                    'retry{:02d}-{:02d}: Failed to get process id for {} on '
                    'host:{}, swacted unexpectedly?'.format(
                        i, count, name, host))
                time.sleep(interval / 3.0)
                continue

            if -1 == pid:
                LOG.error(
                    'retry{:02d}-{:02d}: Failed to get PID for process with '
                    'name:{}, cmd:{}, '
                    'wait and retry'.format(i, count, name, cmd))
                time.sleep(interval / 3.0)
                continue

            if pid in killed_pids:
                LOG.warn(
                    'retry{:02d}-{:02d}: No new process re-created, '
                    'prev-pid={}, cur-pid={}'.format(
                        i, count, killed_pids[-1], pid))
                time.sleep(interval / 3.0)
                continue

            last_killed_pid = killed_pids[-1] if killed_pids else None
            killed_pids.append(pid)
            last_kill_time = exec_times[-1] if exec_times else None
            exec_times.append(datetime.datetime.utcnow())

            latest_events = _get_last_events_timestamps(
                event_log_id=KILL_PROC_EVENT_FORMAT[process_type]['event_id'],
                limit=10)

            LOG.info(
                'retry{:02d}-{:02d}: before kill CLI, proc_name={}, pid={}, '
                'last_killed_pid={}, last_kill_time={}'.format(
                    i, count, proc_name, pid, last_killed_pid, last_kill_time))

            LOG.info('\tactive-controller={}, standby-controller={}'.format(
                active_controller, standby_controller))

            kill_cmd = '{} {}'.format(KILL_CMD, pid)

            with host_helper.ssh_to_host(host, con_ssh=con_ssh) as con:
                code, output = con.exec_sudo_cmd(kill_cmd, fail_ok=True)
                if 0 != code:
                    # it happens occasionally
                    LOG.error('Failed to kill pid:{}, cmd={}, output=<{}>, '
                              'at run:{}, already terminated?'.format(
                                  pid, kill_cmd, output, count))

            if count < retries:
                # IMPACT should not happen yet
                if not check_impact(impact, proc_name,
                                    last_events=latest_events,
                                    active_controller=active_controller,
                                    standby_controller=standby_controller,
                                    expecting_impact=False,
                                    process_type=process_type, host=host,
                                    con_ssh=con_ssh):
                    LOG.error(
                        'Impact:{} observed unexpectedly, it should happen '
                        'only after killing {} times, '
                        'actual killed times:{}'.format(impact, retries,
                                                        count))
                    return -2, host

                LOG.info(
                    'retry{:02d}-{:02d}: OK, NO impact as expected, '
                    'impact={}, will kill it another time'.format(
                        i, count, impact))

                time.sleep(max(interval * 1 / 2.0, 5))

            else:
                expecting_impact = standby_controller is not None
                if not check_impact(
                        impact, proc_name, last_events=latest_events,
                        active_controller=active_controller,
                        standby_controller=standby_controller,
                        expecting_impact=expecting_impact,
                        process_type=process_type, host=host,
                        con_ssh=con_ssh):
                    LOG.error(
                        'No impact after killing process {} {} times, while '
                        '{}'.format(proc_name, count,
                                    ('expecting impact' if expecting_impact
                                     else 'not expecting impact')))

                    return -1, host

                LOG.info(
                    'OK, final retry{:02d}-{:02d}: OK, IMPACT happened '
                    '(if applicable) as expected, '
                    'impact={}'.format(i, count, impact))

                active_controller, standby_controller = \
                    system_helper.get_active_standby_controllers(
                        con_ssh=con_ssh)

                LOG.info(
                    'OK, after impact:{} (tried:{} times), '
                    'now active-controller={}, standby-controller={}'.format(
                        impact, count, active_controller, standby_controller))

                pid, proc_name = get_process_info(name, cmd=cmd, host=host,
                                                  pid_file=pid_file,
                                                  process_type=process_type,
                                                  con_ssh=con_ssh)[0:2]

                return pid, active_controller

    return -3, host


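# Illustrative usage sketch (comments only, not executed), assuming an
# SM-managed service whose IMPACT is a swact after the 2nd kill:
#
#     pid, host = kill_sm_process_and_verify_impact(
#         'glance-api', retries=2, impact='swact', process_type='sm')
#     assert pid > 0, 'kill/impact verification failed, code={}'.format(pid)

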
def wait_for_sm_dump_services_active(timeout=60, fail_ok=False, con_ssh=None,
                                     auth_info=Tenant.get('admin_platform')):
    """
    Wait for all services in sm-dump on the active controller to reach their
    desired states

    Args:
        timeout:
        fail_ok:
        con_ssh:
        auth_info

    Returns:

    """
    active_controller = system_helper.get_active_controller_name(
        con_ssh=con_ssh, auth_info=auth_info)
    return host_helper.wait_for_sm_dump_desired_states(
        controller=active_controller, timeout=timeout, fail_ok=fail_ok)