Merge "Improve report tool system_info plugin behavior"

This commit is contained in:
Zuul 2023-05-25 18:11:55 +00:00 committed by Gerrit Code Review
commit d98212da96
13 changed files with 393 additions and 204 deletions

@ -1,6 +1,6 @@
########################################################################
#
# Copyright (c) 2022 Wind River Systems, Inc.
# Copyright (c) 2022 - 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -109,8 +109,8 @@ class Correlator:
ctrlr_link_down = re.findall(
r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) "
"sm:", line)[0]
elif (re.search("Neighbor (.+) is now in the down", line)
and start_time and not ctrlr_down):
elif (re.search("Neighbor (.+) is now in the down", line) and
start_time and not ctrlr_down):
ctrlr_down = re.findall(
r"Neighbor \((.+)\) received event", line)[0]
elif (re.search("Service (.+) is failed and has reached max "
@ -121,8 +121,8 @@ class Correlator:
r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:",
line)[0]
elif (svc_failed and re.search(
"active-failed\\s+\\| disabling-failed\\s+\\| "
+ svc_failed, line)):
"active-failed\\s+\\| disabling-failed\\s+\\| " +
svc_failed, line)):
if re.search(r"\| go-active-failed\s+\|", line):
go_active_failed = True
else:
@ -140,40 +140,40 @@ class Correlator:
start_time = start_time.strftime("%Y-%m-%dT%H:%M:%S")
end_time = end_time.strftime("%Y-%m-%dT%H:%M:%S")
if link_down:
data.append(start_time + " to " + end_time
+ " Uncontrolled swact, refer to SM logs "
data.append(start_time + " to " + end_time +
" Uncontrolled swact, refer to SM logs "
"for in-depth analysis, original active "
"controller: " + ctrlr_link_down + "\n")
elif ctrlr_down:
if hb_loss:
data.append(start_time + " to " + end_time
+ " Uncontrolled swact due to "
data.append(start_time + " to " + end_time +
" Uncontrolled swact due to "
"spontaneous reset of active "
"controller " + ctrlr_down + "\n")
else:
data.append(start_time + " to " + end_time
+ " Uncontrolled swact likely due to "
data.append(start_time + " to " + end_time +
" Uncontrolled swact likely due to "
"spontaneous reset of active "
"controller " + ctrlr_down + "\n")
elif svc_failed:
if active_failed and go_active_failed:
data.append(start_time + " to " + end_time
+ " Uncontrolled swact due to service "
data.append(start_time + " to " + end_time +
" Uncontrolled swact due to service "
"failure (" + svc_failed + ") twice "
"in 2 minutes was unsuccessful so "
"\"bounced back\" to original active "
"controller " + ctrlr_svc_fail + "\n")
elif active_failed:
data.append(start_time + " to " + end_time
+ " Uncontrolled swact due to service "
data.append(start_time + " to " + end_time +
" Uncontrolled swact due to service "
"failure (" + svc_failed + ") twice "
"in 2 minutes on active controller "
+ ctrlr_svc_fail + "\n")
"in 2 minutes on active controller " +
ctrlr_svc_fail + "\n")
else:
data.append(start_time + " to " + end_time
+ " Uncontrolled swact likely due to "
"service failure (" + svc_failed
+ ") twice in 2 minutes on active "
data.append(start_time + " to " + end_time +
" Uncontrolled swact likely due to "
"service failure (" + svc_failed +
") twice in 2 minutes on active "
"controller " + ctrlr_svc_fail + "\n")
start_time = end_time = svc_failed = None
@ -218,19 +218,19 @@ class Correlator:
host[0] + " auto recovery disabled", line)):
old = data[-1].split("due", 1)
if len(old) == 1:
data[-1] = (data[-1][:-1]
+ " (auto recovery disabled)\n")
data[-1] = (data[-1][:-1] +
" (auto recovery disabled)\n")
else:
data[-1] = (old[0]
+ "(auto recovery disabled) due"
+ old[1])
data[-1] = (old[0] +
"(auto recovery disabled) due" +
old[1])
auto_recov_dis = True
elif "GOENABLED Failed" in line and not goenable_start:
goenable_start, auto_recov_dis = line[0:19], False
goenable_host = re.findall(
"Error : (.+) got GOENABLED Failed", line)[0]
elif ("configuration failed or incomplete" in line
and not config_start):
elif ("configuration failed or incomplete" in line and not
config_start):
config_start = datetime.strptime(line[0:19],
"%Y-%m-%dT%H:%M:%S")
auto_recov_dis = False
@ -248,8 +248,8 @@ class Correlator:
if (re.search(host + " (.+) Heartbeat Loss (.+) "
"\\(during recovery soak\\)", line)):
old = data[-1]
data[-1] = (old[0:23] + line[0:19] + old[42:-1]
+ " (recovery over disabled due to "
data[-1] = (old[0:23] + line[0:19] + old[42:-1] +
" (recovery over disabled due to "
"heartbeat soak failure)\n")
else:
hb_loss_start = line[0:19]
@ -257,15 +257,15 @@ class Correlator:
hb_loss_host = re.findall("Error : (.+) [CM]", line)[0]
# Check if previous failure recorded was heartbeat loss due to
# missing heartbeat messages
elif ("regained MTCALIVE from host that has rebooted" in line
and data and re.search(r"Heartbeat loss failure (.+) "
r"\(recovery over disabled\)",
data[-1])):
elif ("regained MTCALIVE from host that rebooted" in line and
data and re.search(
r"Heartbeat loss failure (.+) "
r"\(recovery over disabled\)", data[-1])):
host = re.findall("failure on (.+) due to", data[-1])[0]
if re.search(host + " regained MTCALIVE", line):
old = data[-1].split("due", 1)[0]
data[-1] = (old[0:23] + line[0:19] + old[42:]
+ "due to uncontrolled reboot\n")
data[-1] = (old[0:23] + line[0:19] + old[42:] +
"due to uncontrolled reboot\n")
elif (hb_loss_start and not comm_loss and hb_loss_host and
re.search(hb_loss_host + " Loss Of Communication for 5 "
"seconds", line)):
@ -282,14 +282,14 @@ class Correlator:
"threshold reached", line)):
goenable_end = line[0:19]
if goenable_tst_f:
data.append(goenable_start + " to " + goenable_end
+ " Go-enable test failure on "
+ goenable_host + " due to failing of "
+ goenable_tst_f + "\n")
data.append(goenable_start + " to " + goenable_end +
" Go-enable test failure on " +
goenable_host + " due to failing of " +
goenable_tst_f + "\n")
else:
data.append(goenable_start + " to " + goenable_end
+ " Go-enable test failure on "
+ goenable_host + " due to unknown test "
data.append(goenable_start + " to " + goenable_end +
" Go-enable test failure on " +
goenable_host + " due to unknown test "
"failing\n")
goenable_start = goenable_end = goenable_host = None
@ -299,8 +299,8 @@ class Correlator:
"threshold reached", line)):
config_end = datetime.strptime(line[0:19],
"%Y-%m-%dT%H:%M:%S")
if (config_tst_f
!= "/etc/goenabled.d/config_goenabled_check.sh"):
if (config_tst_f !=
"/etc/goenabled.d/config_goenabled_check.sh"):
try:
daemon_fail = self.search_daemon_fail(
config_start, config_end, config_host)
@ -308,8 +308,8 @@ class Correlator:
logger.error(e)
if (config_tst_f ==
"/etc/goenabled.d/config_goenabled_check.sh"
or daemon_fail):
"/etc/goenabled.d/config_goenabled_check.sh" or
daemon_fail):
try:
puppet_error = self.search_puppet_error(
config_start, config_end)
@ -320,22 +320,22 @@ class Correlator:
"%Y-%m-%dT%H:%M:%S")
config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S")
if puppet_error:
data.append(config_start + " to " + config_end
+ " Configuration failure on "
+ config_host + " due to:\n"
+ puppet_error)
data.append(config_start + " to " + config_end +
" Configuration failure on " +
config_host + " due to:\n" +
puppet_error)
else:
data.append(config_start + " to " + config_end
+ " Configuration failure on "
+ config_host
+ " due to unknown cause\n")
data.append(config_start + " to " + config_end +
" Configuration failure on " +
config_host +
" due to unknown cause\n")
else:
config_start = config_start.strftime(
"%Y-%m-%dT%H:%M:%S")
config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S")
data.append(config_start + " to " + config_end
+ " Possible configuration failure on "
+ config_host + "\n")
data.append(config_start + " to " + config_end +
" Possible configuration failure on " +
config_host + "\n")
config_start = config_end = config_host = None
config_tst_f = puppet_error = None
@ -344,9 +344,9 @@ class Correlator:
re.search(hb_loss_host + " Connectivity Recovered ",
line)):
hb_loss_end = line[0:19]
data.append(hb_loss_start + " to " + hb_loss_end
+ " Heartbeat loss failure on " + hb_loss_host
+ " due to too many missing heartbeat "
data.append(hb_loss_start + " to " + hb_loss_end +
" Heartbeat loss failure on " + hb_loss_host +
" due to too many missing heartbeat "
"messages\n")
hb_loss_start = hb_loss_end = hb_loss_host = None
@ -355,9 +355,9 @@ class Correlator:
hb_loss_host and re.search(
hb_loss_host + " Graceful Recovery Wait", line)):
hb_loss_end = line[0:19]
data.append(hb_loss_start + " to " + hb_loss_end
+ " Heartbeat loss failure on " + hb_loss_host
+ " due to too many missing heartbeat "
data.append(hb_loss_start + " to " + hb_loss_end +
" Heartbeat loss failure on " + hb_loss_host +
" due to too many missing heartbeat "
"messages (recovery over disabled)\n")
hb_loss_start = hb_loss_end = hb_loss_host = None
@ -383,8 +383,8 @@ class Correlator:
if (re.search("Error : " + host + " (.+) Heartbeat Loss ",
line)):
date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
if (date >= start_time - timedelta(minutes=1)
and date <= end_time):
if (date >= start_time - timedelta(minutes=1) and
date <= end_time):
hb_loss = True
break
@ -405,12 +405,12 @@ class Correlator:
with open(file_path, "r") as daemon_failures:
for line in daemon_failures:
if (re.search("\\d " + host
+ " (.+) Failed to run the puppet manifest",
if (re.search("\\d " + host +
" (.+) Failed to run the puppet manifest",
line)):
date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
if (date >= start_time - timedelta(seconds=10)
and date <= end_time):
if (date >= start_time - timedelta(seconds=10) and
date <= end_time):
daemon_fail = True
break
@ -433,8 +433,8 @@ class Correlator:
for line in puppet_errors:
if "Error: " in line:
date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
if (date >= start_time - timedelta(seconds=10)
and date <= end_time):
if (date >= start_time - timedelta(seconds=10) and
date <= end_time):
puppet_log = line
break
@ -460,13 +460,13 @@ class Correlator:
if "force failed by SM" in line:
host = re.findall("Error : (.+) is being", line)[0]
if hostname == "all" or host == hostname:
data.append(line[0:19] + " " + host
+ " force failed by SM\n")
data.append(line[0:19] + " " + host +
" force failed by SM\n")
elif "Graceful Recovery Failed" in line:
host = re.findall("Info : (.+) Task:", line)[0]
if hostname == "all" or host == hostname:
data.append(line[0:19] + " " + host
+ " graceful recovery failed\n")
data.append(line[0:19] + " " + host +
" graceful recovery failed\n")
elif "MNFA ENTER" in line:
mnfa_start = datetime.strptime(line[0:19],
"%Y-%m-%dT%H:%M:%S")
@ -487,9 +487,9 @@ class Correlator:
"%Y-%m-%dT%H:%M:%S")
mnfa_duration -= mnfa_start
mnfa_start = mnfa_start.strftime("%Y-%m-%dT%H:%M:%S")
data.append(mnfa_start + " Multi-node failure avoidance "
+ "(duration: " + str(mnfa_duration)
+ "; history:" + mnfa_hist + ")\n")
data.append(mnfa_start + " Multi-node failure avoidance " +
"(duration: " + str(mnfa_duration) +
"; history:" + mnfa_hist + ")\n")
mnfa_start, mnfa_hist = None, ""
@ -506,9 +506,9 @@ class Correlator:
svc_failed = re.findall(
r"Service \((.+)\) is failed", line)[0]
if hostname == "all" or host == hostname:
data.append(line[0:19] + " " + host
+ " service failure (" + svc_failed
+ ")\n")
data.append(line[0:19] + " " + host +
" service failure (" + svc_failed +
")\n")
return data
@ -524,7 +524,9 @@ class Correlator:
# Open 'alarm' output file from alarm plugin and read it
file_path = os.path.join(self.plugin_output_dir, "alarm")
if not os.path.exists(file_path):
logger.debug("No alarms found")
return data
with open(file_path, "r") as alarm:
extract = False
for line in alarm:
@ -547,8 +549,8 @@ class Correlator:
temp = []
for entry in data:
temp.append(entry["name"] + " - set: " + str(entry["set"])
+ ", clear: " + str(entry["clear"]) + "\n")
temp.append(entry["name"] + " - set: " + str(entry["set"]) +
", clear: " + str(entry["clear"]) + "\n")
data = temp
return data
@ -572,8 +574,8 @@ class Correlator:
host = re.findall("Info : (.+) is ENABLED", line)[0]
state = re.findall("is (.+)\n", line)[0].lower()
if hostname == "all" or hostname in host:
data.append(line[0:19] + " " + host + " " + state
+ "\n")
data.append(line[0:19] + " " + host + " " +
state + "\n")
elif "locked-disabled" in line:
host = re.findall(
"Info : (.+) u?n?locked-disabled", line)[0]

@ -1,6 +1,6 @@
########################################################################
#
# Copyright (c) 2022 Wind River Systems, Inc.
# Copyright (c) 2022 - 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -23,6 +23,7 @@
########################################################################
import logging
import mmap
import os
import re
import subprocess
@ -48,6 +49,10 @@ sys.dont_write_bytecode = True
logger = logging.getLogger(__name__)
# regex expression used to get the hostname from the host dir name
# eg: chops '_20221201.213332' off of controller-0_20221201.213332
regex_chop_bundle_date = r"_\d{8}\.\d{6}"
class ExecutionEngine:
def __init__(self, opts, input_dir, output_dir):
@ -62,7 +67,9 @@ class ExecutionEngine:
self.opts = opts
self.hosts = {"controllers": {}, "workers": {}, "storages": {}}
self.active_controller_directory = None
self.active_controller_hostname = None
self.host_dirs = []
self.hostnames = []
if not os.path.isdir(input_dir):
logger.error("Error: Invalid input directory: %s", input_dir)
@ -89,37 +96,48 @@ class ExecutionEngine:
except subprocess.CalledProcessError as e:
logger.error(e)
# TODO: Need a better way to figure out the active controller
# Consider getting the system info from all hosts.
#
# Determine the active controller and load system info from it.
for folder in (f.path for f in os.scandir(input_dir)):
logger.debug("folder: %s", os.path.basename(folder))
# skip over the tarballs
logger.debug("base folder: %s", os.path.basename(folder))
# skip over files (the tarballs)
if not os.path.isdir(folder):
continue
basename = os.path.basename(folder)
if basename == "report_analysis":
continue
# Get the hostname from the host folder
hostname = re.sub(regex_chop_bundle_date, "", basename)
self.hostnames.append(hostname)
logger.debug("searching for active controller: %s" % hostname)
host_dir = folder
extra_path = os.path.join(host_dir, "var", "extra")
database_path = os.path.join(host_dir, extra_path, "database")
host_info_path = os.path.join(host_dir, extra_path, "host.info")
if os.path.isdir(host_dir):
extra_path = os.path.join(host_dir, "var", "extra")
# don't analyse a directory that doesn't contain
# a 'var/extra' dir.
if not os.path.exists(extra_path):
logger.warning("missing var/extra for %s" % hostname)
continue
database_path = os.path.join(host_dir, extra_path, "database")
hostinfo_path = os.path.join(host_dir, extra_path, "host.info")
if os.path.exists(database_path):
if os.listdir(database_path):
logger.info("Active Ctrl: %s" % hostname)
self.active_controller_directory = folder
self.active_controller_hostname = hostname
self.host_dirs.append(host_dir)
logger.debug("Host Dirs: %s", self.host_dirs)
if os.path.exists(host_info_path):
# save host folder path based on nodetype
if os.path.exists(hostinfo_path):
hostname, subfunction = self._extract_subfunction(
host_info_path)
hostinfo_path)
if "controller" in subfunction:
self.hosts["controllers"][hostname] = folder
elif "worker" in subfunction:
@ -127,9 +145,8 @@ class ExecutionEngine:
elif "storage" in subfunction:
self.hosts["storages"][hostname] = folder
self.active_controller_directory = folder
if not self.active_controller_directory:
raise ValueError("Active controller not found")
logger.error("Active Ctrl: NOT FOUND")
def execute(self, plugins, output_dir):
"""Run a list of plugins
@ -178,6 +195,7 @@ class ExecutionEngine:
os.path.join(folderpath, file)
for file in plugin.state["files"]
],
plugin.state["exclude"],
)
# creating output file
@ -186,37 +204,56 @@ class ExecutionEngine:
f"substring_{hostname}",
)
if self.opts.verbose:
logger.info("... output at "
+ os.path.abspath(output_file))
with open(output_file, "w") as file:
file.write(
f"Date range: {self.opts.start} until "
f"{self.opts.end}\n"
)
file.write(
f"substrings: "
f"{' '.join(plugin.state['substring'])}\n"
)
for line in events:
if line[-1] == "\n":
file.write(line)
else:
file.write(line + "\n")
logger.info("... output at " +
os.path.abspath(output_file))
if events:
with open(output_file, "w") as file:
file.write(
f"Date range: {self.opts.start} until "
f"{self.opts.end}\n"
)
file.write(
f"substrings: "
f"{' '.join(plugin.state['substring'])}\n"
)
for line in events:
if line[-1] == "\n":
file.write(line)
else:
file.write(line + "\n")
else:
if plugin.state["algorithm"] == algorithms.SYSTEM_INFO:
for host_dir in self.host_dirs:
info = system_info(host_dir)
system_info_output = os.path.join(plugin_output_dir,
"system_info")
with open(system_info_output, "w") as file:
for i in info:
file.write(i + "\n")
for k, v in self.hosts.items():
file.write(f"{k}: {','.join(v.keys())}\n")
# Get system info of the active controller first
# and then put the system info of each host in the
# system info output folder.
system_info_output = os.path.join(plugin_output_dir,
"system_info")
if os.path.exists(system_info_output):
os.remove(system_info_output)
if self.opts.verbose:
logger.info(processing + ", output at " +
os.path.abspath(system_info_output))
hostname = None
host_dir = None
if self.active_controller_directory is None:
hostname = re.sub(regex_chop_bundle_date, "",
os.path.basename(self.host_dirs[0]))
host_dir = self.host_dirs[0]
else:
hostname = self.active_controller_hostname
host_dir = self.active_controller_directory
system_info(hostname, host_dir,
system_info_output,
self.hosts, True)
for host_dir in self.host_dirs:
if host_dir != self.active_controller_directory:
hostname = re.sub(regex_chop_bundle_date, "",
os.path.basename(host_dir))
system_info(hostname,
host_dir,
system_info_output,
None,
False)
elif plugin.state["algorithm"] == algorithms.AUDIT:
hosts = {}
@ -284,7 +321,6 @@ class ExecutionEngine:
file.write(f"{k}:\n")
for date in v["dates"]:
file.write(f" {date}\n")
# creating output log file
with open(log_output, "w") as file:
for k, v in logs.items():
@ -312,14 +348,16 @@ class ExecutionEngine:
self._create_output_file(
"maintenance_errors", plugin_output_dir,
maintenance_errors(self.hosts, self.opts.start,
self.opts.end),
self.opts.end,
plugin.state["exclude"]),
processing
)
elif plugin.state["algorithm"] == algorithms.DAEMON_FAILURES:
self._create_output_file(
"daemon_failures", plugin_output_dir,
daemon_failures(self.hosts, self.opts.start,
self.opts.end),
self.opts.end,
plugin.state["exclude"]),
processing
)
elif plugin.state["algorithm"] == algorithms.STATE_CHANGES:
@ -330,6 +368,35 @@ class ExecutionEngine:
processing
)
# Dump a summary of data found by the plugins
if os.path.exists(plugin_output_dir):
# Print a summary of the logs/data gathered by the plugins
empty_files = ""
logger.info("Plugin Results:\n")
for fn in os.listdir(plugin_output_dir):
filename = os.path.join(plugin_output_dir, fn)
with open(filename, "r+") as f:
# Show how much data is in each plugins output file
if os.path.isfile(filename) and os.path.getsize(filename):
buf = mmap.mmap(f.fileno(), 0)
entries = 0
readline = buf.readline
while readline():
entries += 1
if fn == "system_info":
logger.info(filename)
else:
logger.info("%s has %d entries" %
(filename, entries))
else:
empty_files += fn + " "
if empty_files:
logger.info("\n... nothing found by plugins: %s" % empty_files)
else:
logger.error("Plugin output dir missing: %s" % plugin_output_dir)
sys.exit("... exiting")
# Running the correlator and printing the output from it
self.run_correlator(output_dir, plugin_output_dir)
@ -357,36 +424,46 @@ class ExecutionEngine:
failures.append("\nTotal failures found: " + str(failures_len) + "\n")
events.append("\nTotal events found: " + str(events_len) + "\n")
alarms.append("\nTotal alarms found: " + str(alarms_len) + "\n")
state_changes.append("\nTotal state changes found: "
+ str(state_changes_len) + "\n")
state_changes.append("\nTotal state changes found: " +
str(state_changes_len) + "\n")
# TODO: Put at the end of the report
logger.info("\nRunning correlator... view report at "
+ output_dir)
self._create_output_file("correlator_failures", output_dir,
logger.info("\nCorrelated Results:\n")
self._create_output_file("failures", output_dir,
failures, "")
self._create_output_file("correlator_events", output_dir,
self._create_output_file("events", output_dir,
events, "")
self._create_output_file("correlator_alarms", output_dir,
self._create_output_file("alarms", output_dir,
alarms, "")
self._create_output_file("correlator_state_changes", output_dir,
self._create_output_file("state_changes", output_dir,
state_changes, "")
max = 0
for sl in [events_len, alarms_len, state_changes_len, failures_len]:
if len(str(sl)) > max:
max = len(str(sl))
if not self.opts.verbose:
logger.info("Events : " + str(events_len))
logger.info("Alarms : " + str(alarms_len))
logger.info("State Changes: " + str(state_changes_len))
logger.info("Failures : " + str(failures_len))
logger.info("Events : " + str(events_len) +
" " * (max - len(str(events_len))) +
" " + output_dir + "/events")
logger.info("Alarms : " + str(alarms_len) +
" " * (max - len(str(alarms_len))) +
" " + output_dir + "/alarms")
logger.info("State Changes: " + str(state_changes_len) +
" " * (max - len(str(state_changes_len))) +
" " + output_dir + "/state_changes")
logger.info("Failures : " + str(failures_len) +
" " * (max - len(str(failures_len))) +
" " + output_dir + "/failures")
for f in failures[:-1]:
if "Uncontrolled swact" in f:
logger.info(f[0:19] + " "
+ re.findall("active controller:? (.+)\n",
f)[0] + " uncontrolled swact")
logger.info(f[0:19] + " " +
re.findall("active controller:? (.+)\n",
f)[0] + " uncontrolled swact")
elif "failure on" in f:
host = re.findall(r"failure on ([^\s]+) ", f)[0]
logger.info(f[0:19] + " " + host + " "
+ re.findall("^(.+) failure on ",
f[43:])[0].lower() + " failure")
logger.info(f[0:19] + " " + host + " " +
re.findall("^(.+) failure on ",
f[43:])[0].lower() + " failure")
else:
logger.info(f[:-1])
else:
@ -405,9 +482,9 @@ class ExecutionEngine:
logger.info(k + ": " + str(v) + " time(s)")
logger.info("\nAlarms: " + str(alarms_len))
logger.info("The full list of alarms can be found at "
+ os.path.abspath(output_dir)
+ "/correlator_alarms")
logger.info("The full list of alarms can be found at " +
os.path.abspath(output_dir) +
"/alarms")
# Dictionary to keep track of number of times state changes
# happens on each host
@ -451,8 +528,8 @@ class ExecutionEngine:
else:
file.write(i + "\n")
if self.opts.verbose:
output = ("... output at "
+ os.path.abspath(os.path.join(directory, filename)))
output = ("... output at " +
os.path.abspath(os.path.join(directory, filename)))
if processing == "":
logger.info(output)
else:

@ -1,6 +1,6 @@
########################################################################
#
# Copyright (c) 2022 Wind River Systems, Inc.
# Copyright (c) 2022 - 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -39,6 +39,7 @@ class Plugin:
"files": [],
"hosts": [],
"substring": [],
"exclude": [],
"alarm_exclude": [],
"entity_exclude": [],
"start": None,
@ -78,6 +79,10 @@ class Plugin:
line (string): Line from plugin file to extract
"""
# allow plugins to have empty lines or comments starting with #
if len(line) <= 1 or line[0] == '#':
return
# split string from first '=', left side is label right side is value
data = line.strip().split("=", 1)
if len(data) <= 1:
@ -85,11 +90,17 @@ class Plugin:
label = data[0]
value = data[1]
label = label.replace(" ", "")
# reject labels that don't start with an alphabetical char
if label[0].isalpha() is False:
raise ValueError("Invalid label value")
try:
if label == "algorithm":
self.state["algorithm"] = value.replace(" ", "")
elif label == "substring":
self.state["substring"].append(data[1])
elif label == "exclude":
self.state["exclude"].append(data[1])
elif label == "hosts":
self.state["hosts"] = value.replace(" ", "").split(",")
elif label == "alarm_exclude":

@ -46,8 +46,8 @@ def audit(start, end, audit_log_path):
# Counts sum of audits from all subclouds
]
INDEX_MIDDLE_WORD = 1
data = [("These rates and totals represent the sum of audits from "
+ "all subclouds")]
data = [("These rates and totals represent the sum of audits " +
"from all subclouds")]
def command(text):

@ -19,7 +19,7 @@ import os
from plugin_algs.substring import substring
def daemon_failures(hosts, start, end):
def daemon_failures(hosts, start, end, exclude_list=None):
"""Daemon failures algorithm
Presents all "Failed to run the puppet manifest" log messages in the system
@ -37,6 +37,6 @@ def daemon_failures(hosts, start, end):
daemon_files.append(daemon_path)
daemon_substrings = ["Failed to run the puppet manifest"]
data = substring(start, end, daemon_substrings, daemon_files)
data = substring(start, end, daemon_substrings, daemon_files, exclude_list)
return sorted(data)

@ -19,7 +19,7 @@ import os
from plugin_algs.substring import substring
def maintenance_errors(hosts, start, end):
def maintenance_errors(hosts, start, end, exclude_list=None):
"""Maintenance errors algorithm
Presents maintenance errors and other relevant log messages in system,
such as "Configuration failure"
@ -51,6 +51,6 @@ def maintenance_errors(hosts, start, end):
"auto recovery disabled",
"Graceful Recovery Failed",
"MNFA ENTER", "MNFA EXIT", "MNFA POOL"]
data = substring(start, end, mtc_substrings, mtc_files)
data = substring(start, end, mtc_substrings, mtc_files, exclude_list)
return sorted(data)

@ -1,6 +1,6 @@
########################################################################
#
# Copyright (c) 2022 Wind River Systems, Inc.
# Copyright (c) 2022 - 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -24,7 +24,7 @@ import subprocess
logger = logging.getLogger(__name__)
def substring(start, end, substr, files):
def substring(start, end, substr, files, exclude_list=None):
"""Substring algorithm
Looks for all substrings in substr within files
@ -33,7 +33,7 @@ def substring(start, end, substr, files):
end (string): End time for analysis
substr (string list): List of substrings to look for
files (string list): List of absolute filepaths to search in
exclude_list (string list): list of strings to exclude from report
Errors:
FileNotFoundError
"""
@ -49,15 +49,17 @@ def substring(start, end, substr, files):
if (re.search("controller-1_(.+)/var/log/mtcAgent.log",
file)):
continue
raise FileNotFoundError(f"File not found: {file}")
else:
data.append("File not found: " + file)
continue
cont = True
# Searching through file
command = (f"""grep -Ea "{'|'.join(s for s in substr)}" """
f"""{file} 2>/dev/null""")
status = _continue(start, end, file)
if (status == CONTINUE_CURRENT
or status == CONTINUE_CURRENT_OLD):
if (status == CONTINUE_CURRENT or
status == CONTINUE_CURRENT_OLD):
# continue with current file
if status == CONTINUE_CURRENT:
cont = False
@ -70,8 +72,8 @@ def substring(start, end, substr, files):
f"""{file}.{n} 2>/dev/null""")
status = _continue(start, end, f"{file}.{n}")
if (status == CONTINUE_CURRENT
or status == CONTINUE_CURRENT_OLD):
if (status == CONTINUE_CURRENT or
status == CONTINUE_CURRENT_OLD):
if status == CONTINUE_CURRENT:
cont = False
_evaluate_substring(start, end, data, command)
@ -85,8 +87,8 @@ def substring(start, end, substr, files):
status = _continue(start, end, f"{file}.{n}.gz",
compressed=True)
if (status == CONTINUE_CURRENT
or status == CONTINUE_CURRENT_OLD):
if (status == CONTINUE_CURRENT or
status == CONTINUE_CURRENT_OLD):
if status == CONTINUE_CURRENT:
cont = False
_evaluate_substring(start, end, data, command)
@ -97,6 +99,17 @@ def substring(start, end, substr, files):
logger.error(e)
continue
# now remove any logs that contain substrings in the exclude_list
if exclude_list:
filtered_data = []
for e in data:
found = False
for exclude in exclude_list:
if e.find(exclude) != -1:
found = True
if found is False:
filtered_data.append(e)
return sorted(filtered_data)
return sorted(data)

@ -1,6 +1,6 @@
########################################################################
#
# Copyright (c) 2022 Wind River Systems, Inc.
# Copyright (c) 2022 - 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -17,47 +17,112 @@ import os
import re
def system_info(host_dir):
def system_info(hostname, host_dir, output_dir, hosts, loud=False):
"""System info algorithm
Presents basic information about the system, such as the build type
Parameters:
host_dir (string): path to the collect host dir
hostname (string): name of the host
host_dir (string): path to the collect host dir
output_dir (string): path to the file to store the system info
hosts (string): list of host objects
loud (boolean): when True print system info to stdout
Returns: nothing
"""
data = []
with open(
os.path.join(host_dir, "etc", "platform", "platform.conf")
) as file:
if host_dir is None:
raise ValueError("system_info:No specified host dir")
# from /etc/platform/platform.conf
platform_conf = os.path.join(host_dir, "etc", "platform", "platform.conf")
# ... load the following items first
with open(platform_conf) as file:
for line in file:
if "system_mode" in line:
data.append(
f"System Mode: "
f"{re.match('^system_mode=(.*)', line).group(1)}"
)
val = re.match('^system_mode=(.*)', line).group(1)
data.append(f"System Mode: {val}")
elif "system_type" in line:
data.append(
f"System Type: "
f"{re.match('^system_type=(.*)', line).group(1)}"
)
val = re.match('^system_type=(.*)', line).group(1)
data.append(f"System Type: {val}")
elif "distributed_cloud_role" in line:
role = re.match('^distributed_cloud_role=(.*)',
line).group(1)
data.append(f"Distributed cloud role: {role}")
data.append(f"DC Role : {role}")
elif "sw_version" in line:
data.append(
f"SW Version: "
f"{re.match('^sw_version=(.*)', line).group(1)}"
)
val = re.match('^sw_version=(.*)', line).group(1)
data.append(f"S/W Version: {val}")
# ... followed by these items
with open(platform_conf) as file:
for line in file:
if "nodetype" in line:
val = re.match('^nodetype=(.*)', line).group(1)
data.append(f"Node Type : {val}")
elif "subfunction" in line:
val = re.match('^subfunction=(.*)', line).group(1)
data.append(f"subfunction: {val}")
elif "oam_interface" in line:
val = re.match('^oam_interface=(.*)', line).group(1)
data.append(f"OAM Iface : {val}")
elif "management_interface" in line:
val = re.match('^management_interface=(.*)', line).group(1)
data.append(f"Mgmt Iface : {val}")
elif "cluster_host_interface" in line:
val = re.match('^cluster_host_interface=(.*)', line).group(1)
data.append(f"Clstr Iface: {val}")
# /etc/os-release info
with open(
os.path.join(host_dir, "etc", "os-release")
) as file:
for line in file:
if "PRETTY_NAME" in line:
val = (re.match('^PRETTY_NAME=(.*)', line).group(1))
val = val.strip('\"')
data.append(f"OS Release : {val}")
# /etc/build.info
with open(
os.path.join(host_dir, "etc", "build.info")
) as file:
for line in file:
if "BUILD_TYPE" in line:
data.append(
f"Build Type: "
f"{re.match('^BUILD_TYPE=(.*)', line).group(1)}"
)
elif re.match("^OS=(.*)", line):
data.append(f"OS: {re.match('^OS=(.*)', line).group(1)}")
val = (re.match('^BUILD_TYPE=(.*)', line).group(1))
val = val.strip('\"')
data.append(f"Build Type : {val}")
elif "BUILD_DATE" in line:
val = (re.match('^BUILD_DATE=(.*)', line).group(1))
val = val.strip('\"')
data.append(f"Build Date : {val}")
elif "BUILD_DIR" in line:
val = (re.match('^BUILD_DIR=(.*)', line).group(1))
val = val.strip('\"')
data.append(f"Build Dir : {val}")
return data
with open(output_dir, "a") as file:
dashs = "-" * len(hostname)
file.write("\n" + hostname + "\n" + dashs + "\n")
for i in data:
file.write(i + "\n")
if loud is True:
print(i)
if hosts is not None:
for k, v in hosts.items():
if not len(v.keys()):
continue
if k == "storages":
k += " "
if k == "workers":
k += " "
file.write(f"{k}: {','.join(v.keys())}\n")
if loud is True:
print(f"{k}: {','.join(v.keys())}")
# create an empty line following the system info dump
if loud is True:
print("")
return

@@ -1 +1,2 @@
algorithm=maintenance_errors
exclude=task clear

@@ -0,0 +1,14 @@
algorithm=substring
files=var/log/sm.log
hosts=controllers
# logs to exclude
substring=ERROR: sm
exclude=Failed to set alarm
exclude=Failed to set log
exclude=Failed to clear alarm
exclude=Failed to get all alarms
exclude=Failed to query service based on pid
exclude=Failed to look up interface name
exclude=Failed to stop service heartbeat thread
exclude=Heartbeat is not required

@@ -1,5 +1,5 @@
algorithm=substring
files=var/log/mtcAgent.log, var/log/sm.log
files=var/log/mtcAgent.log
hosts=controllers
substring=operation failed
substring=Failed to send message

@@ -446,6 +446,7 @@ bundle_names = []
bundles = []
ignore_list = [analysis_folder_name]
ignore_list += ["apps", "horizon", "lighttpd", "lost+found", "sysinv-tmpdir"]
ignore_list += ["patch-api-proxy-tmpdir", "platform-api-proxy-tmpdir"]
with open(os.path.join(output_dir, "untar.log"), "a") as logfile:
for obj in (os.scandir(input_dir)):
@@ -463,6 +464,9 @@ with open(os.path.join(output_dir, "untar.log"), "a") as logfile:
date_time = obj.name[-15:]
if args.debug:
logger.debug("Found Dir : %s : %s", obj.name, date_time)
elif os.path.islink(obj.path):
# ignore sym links
continue
else:
if not tarfile.is_tarfile(obj.path):
continue
@@ -559,6 +563,7 @@ elif args.debug:
# create the output directory ; report_analysis
output_dir = os.path.join(path_file, analysis_folder_name)
print("\nReport: %s\n" % output_dir)
if not os.path.exists(output_dir):
os.makedirs(output_dir, exist_ok=True)
@@ -567,7 +572,7 @@ try:
engine = ExecutionEngine(args, path_file, output_dir)
except ValueError as e:
logger.error(str(e))
sys.exit("Confirm you are running the report tool on a collect bundle")
logger.error("Confirm you are running the report tool on a collect bundle")
if args.algorithm:
plugins.append(Plugin(opts=vars(args)))

@@ -30,7 +30,7 @@ override_dh_auto_install:
install -m 755 -p expect_done $(ROOT)/usr/local/sbin/expect_done
install -m 755 -p mariadb-cli.sh $(ROOT)/usr/local/sbin/mariadb-cli
# Report Tool
# Report Tool
install -m 755 -p report/report.py $(ROOT)/usr/local/bin/report/report.py
install -m 755 -p report/execution_engine.py $(ROOT)/usr/local/bin/report/execution_engine.py
install -m 755 -p report/algorithms.py $(ROOT)/usr/local/bin/report/algorithms.py
@@ -38,7 +38,7 @@ override_dh_auto_install:
install -m 755 -p report/correlator.py $(ROOT)/usr/local/bin/report/correlator.py
install -m 755 -p report/README $(ROOT)/usr/local/bin/report/README
# Report Tool Plugin Algorithms
# Report Tool Plugin Algorithms
install -m 755 -p report/plugin_algs/alarm.py $(ROOT)/usr/local/bin/report/plugin_algs/alarm.py
install -m 755 -p report/plugin_algs/audit.py $(ROOT)/usr/local/bin/report/plugin_algs/audit.py
install -m 755 -p report/plugin_algs/daemon_failures.py $(ROOT)/usr/local/bin/report/plugin_algs/daemon_failures.py
@@ -51,20 +51,21 @@ override_dh_auto_install:
install -m 755 -p report/plugin_algs/swact_activity.py $(ROOT)/usr/local/bin/report/plugin_algs/swact_activity.py
install -m 755 -p report/plugin_algs/system_info.py $(ROOT)/usr/local/bin/report/plugin_algs/system_info.py
# Report Tool Plugins
# Report Tool Plugins
install -m 755 -p report/plugins/alarm $(ROOT)/usr/local/bin/report/plugins/alarm
install -m 755 -p report/plugins/daemon_failures $(ROOT)/usr/local/bin/report/plugins/daemon_failures
install -m 755 -p report/plugins/heartbeat_loss $(ROOT)/usr/local/bin/report/plugins/heartbeat_loss
install -m 755 -p report/plugins/maintenance_errors $(ROOT)/usr/local/bin/report/plugins/maintenance_errors
install -m 755 -p report/plugins/process_failures $(ROOT)/usr/local/bin/report/plugins/process_failures
install -m 755 -p report/plugins/puppet_errors $(ROOT)/usr/local/bin/report/plugins/puppet_errors
install -m 755 -p report/plugins/sm_errors $(ROOT)/usr/local/bin/report/plugins/sm_errors
install -m 755 -p report/plugins/state_changes $(ROOT)/usr/local/bin/report/plugins/state_changes
install -m 755 -p report/plugins/substring $(ROOT)/usr/local/bin/report/plugins/substring
install -m 755 -p report/plugins/swact_activity $(ROOT)/usr/local/bin/report/plugins/swact_activity
install -m 755 -p report/plugins/system_info $(ROOT)/usr/local/bin/report/plugins/system_info
install -m 755 -p report/plugins/substring_hosts $(SYSCONFDIR)/collect/plugins/substring_hosts
# Collect Plugins
# Collect Plugins
install -m 755 -p collect_sysinv.sh $(SYSCONFDIR)/collect.d/collect_sysinv
install -m 755 -p collect_psqldb.sh $(SYSCONFDIR)/collect.d/collect_psqldb
install -m 755 -p collect_mariadb.sh $(SYSCONFDIR)/collect.d/collect_mariadb