Improve report tool system_info plugin behavior

The current system_info plugin logs the system info for the last
host in host_dirs rather than that of the active controller.

It also does not capture the system info for all of the nodes
in its plugin output file.

This update improves the system_info plugin and implements the
following rendering and substring handling improvements.

1. Improves system_info plugin capture and rendering.

2. Adds the controller that was active at the time of the collect
   to the system info rendering output.

3. Improves report analysis rendering by displaying the full
   path to plugin and correlation files.

4. Adds string exclude support to the substring algorithm.
   This allows generic string searches like ERROR to be gathered
   while filtering out logs that are considered noise, as shown
   in the sketch after this list.

5. Creates a separate SM errors substring plugin using the new
   exclude option.

6. Adds support for commented and empty lines in plugin files.
   This allows plugins to be properly commented and formatted.

7. Adds plugin label name error checking.
   This makes it easier to debug improperly coded plugins.

8. Fixes additional pep8 warnings.
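
A minimal sketch of the new exclude filtering (the sample logs are
hypothetical; the loop mirrors the plugin_algs/substring.py diff below):

    def filter_excludes(data, exclude_list=None):
        # drop any gathered log line that contains an exclude substring
        if not exclude_list:
            return sorted(data)
        filtered_data = []
        for entry in data:
            found = False
            for exclude in exclude_list:
                if entry.find(exclude) != -1:
                    found = True
            if found is False:
                filtered_data.append(entry)
        return sorted(filtered_data)

    logs = ["2023-05-23T17:22:01.000 sm: ERROR: sm service audit failed",
            "2023-05-23T17:22:02.000 sm: ERROR: sm Failed to set alarm"]
    # keeps the audit failure, drops the known-noise alarm log
    print(filter_excludes(logs, ["Failed to set alarm"]))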

Test Plan:

PASS: Verify on-system collect with --report option
PASS: Verify on-system report generation
PASS: Verify off-system report generation from git
PASS: Verify system_info plugin collects info from all hosts
PASS: Verify report displays system_info from active controller
PASS: Verify handling when no active controller is detected
PASS: Verify new sm_errors substring plugin with excludes
PASS: Verify plugins can have empty or # commented lines
PASS: Verify report tool plugin output includes the path to each
      plugin file
PASS: Verify report tool correlations include the path to each
      correlation file
PASS: Verify report tool plugin label parsing error handling
PASS: Verify all files pass pep8 without warning or error

Story: 2010533
Task: 48072
Change-Id: I6d0253a4c3d8804a5e45b970d766e578ea69368f
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>

@ -1,6 +1,6 @@
########################################################################
#
# Copyright (c) 2022 Wind River Systems, Inc.
# Copyright (c) 2022 - 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -109,8 +109,8 @@ class Correlator:
ctrlr_link_down = re.findall(
r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) "
"sm:", line)[0]
elif (re.search("Neighbor (.+) is now in the down", line)
and start_time and not ctrlr_down):
elif (re.search("Neighbor (.+) is now in the down", line) and
start_time and not ctrlr_down):
ctrlr_down = re.findall(
r"Neighbor \((.+)\) received event", line)[0]
elif (re.search("Service (.+) is failed and has reached max "
@ -121,8 +121,8 @@ class Correlator:
r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:",
line)[0]
elif (svc_failed and re.search(
"active-failed\\s+\\| disabling-failed\\s+\\| "
+ svc_failed, line)):
"active-failed\\s+\\| disabling-failed\\s+\\| " +
svc_failed, line)):
if re.search(r"\| go-active-failed\s+\|", line):
go_active_failed = True
else:
@ -140,40 +140,40 @@ class Correlator:
start_time = start_time.strftime("%Y-%m-%dT%H:%M:%S")
end_time = end_time.strftime("%Y-%m-%dT%H:%M:%S")
if link_down:
data.append(start_time + " to " + end_time
+ " Uncontrolled swact, refer to SM logs "
data.append(start_time + " to " + end_time +
" Uncontrolled swact, refer to SM logs "
"for in-depth analysis, original active "
"controller: " + ctrlr_link_down + "\n")
elif ctrlr_down:
if hb_loss:
data.append(start_time + " to " + end_time
+ " Uncontrolled swact due to "
data.append(start_time + " to " + end_time +
" Uncontrolled swact due to "
"spontaneous reset of active "
"controller " + ctrlr_down + "\n")
else:
data.append(start_time + " to " + end_time
+ " Uncontrolled swact likely due to "
data.append(start_time + " to " + end_time +
" Uncontrolled swact likely due to "
"spontaneous reset of active "
"controller " + ctrlr_down + "\n")
elif svc_failed:
if active_failed and go_active_failed:
data.append(start_time + " to " + end_time
+ " Uncontrolled swact due to service "
data.append(start_time + " to " + end_time +
" Uncontrolled swact due to service "
"failure (" + svc_failed + ") twice "
"in 2 minutes was unsuccessful so "
"\"bounced back\" to original active "
"controller " + ctrlr_svc_fail + "\n")
elif active_failed:
data.append(start_time + " to " + end_time
+ " Uncontrolled swact due to service "
data.append(start_time + " to " + end_time +
" Uncontrolled swact due to service "
"failure (" + svc_failed + ") twice "
"in 2 minutes on active controller "
+ ctrlr_svc_fail + "\n")
"in 2 minutes on active controller " +
ctrlr_svc_fail + "\n")
else:
data.append(start_time + " to " + end_time
+ " Uncontrolled swact likely due to "
"service failure (" + svc_failed
+ ") twice in 2 minutes on active "
data.append(start_time + " to " + end_time +
" Uncontrolled swact likely due to "
"service failure (" + svc_failed +
") twice in 2 minutes on active "
"controller " + ctrlr_svc_fail + "\n")
start_time = end_time = svc_failed = None
@ -218,19 +218,19 @@ class Correlator:
host[0] + " auto recovery disabled", line)):
old = data[-1].split("due", 1)
if len(old) == 1:
data[-1] = (data[-1][:-1]
+ " (auto recovery disabled)\n")
data[-1] = (data[-1][:-1] +
" (auto recovery disabled)\n")
else:
data[-1] = (old[0]
+ "(auto recovery disabled) due"
+ old[1])
data[-1] = (old[0] +
"(auto recovery disabled) due" +
old[1])
auto_recov_dis = True
elif "GOENABLED Failed" in line and not goenable_start:
goenable_start, auto_recov_dis = line[0:19], False
goenable_host = re.findall(
"Error : (.+) got GOENABLED Failed", line)[0]
elif ("configuration failed or incomplete" in line
and not config_start):
elif ("configuration failed or incomplete" in line and not
config_start):
config_start = datetime.strptime(line[0:19],
"%Y-%m-%dT%H:%M:%S")
auto_recov_dis = False
@ -248,8 +248,8 @@ class Correlator:
if (re.search(host + " (.+) Heartbeat Loss (.+) "
"\\(during recovery soak\\)", line)):
old = data[-1]
data[-1] = (old[0:23] + line[0:19] + old[42:-1]
+ " (recovery over disabled due to "
data[-1] = (old[0:23] + line[0:19] + old[42:-1] +
" (recovery over disabled due to "
"heartbeat soak failure)\n")
else:
hb_loss_start = line[0:19]
@ -257,15 +257,15 @@ class Correlator:
hb_loss_host = re.findall("Error : (.+) [CM]", line)[0]
# Check if previous failure recorded was heartbeat loss due to
# missing heartbeat messages
elif ("regained MTCALIVE from host that has rebooted" in line
and data and re.search(r"Heartbeat loss failure (.+) "
r"\(recovery over disabled\)",
data[-1])):
elif ("regained MTCALIVE from host that rebooted" in line and
data and re.search(
r"Heartbeat loss failure (.+) "
r"\(recovery over disabled\)", data[-1])):
host = re.findall("failure on (.+) due to", data[-1])[0]
if re.search(host + " regained MTCALIVE", line):
old = data[-1].split("due", 1)[0]
data[-1] = (old[0:23] + line[0:19] + old[42:]
+ "due to uncontrolled reboot\n")
data[-1] = (old[0:23] + line[0:19] + old[42:] +
"due to uncontrolled reboot\n")
elif (hb_loss_start and not comm_loss and hb_loss_host and
re.search(hb_loss_host + " Loss Of Communication for 5 "
"seconds", line)):
@ -282,14 +282,14 @@ class Correlator:
"threshold reached", line)):
goenable_end = line[0:19]
if goenable_tst_f:
data.append(goenable_start + " to " + goenable_end
+ " Go-enable test failure on "
+ goenable_host + " due to failing of "
+ goenable_tst_f + "\n")
data.append(goenable_start + " to " + goenable_end +
" Go-enable test failure on " +
goenable_host + " due to failing of " +
goenable_tst_f + "\n")
else:
data.append(goenable_start + " to " + goenable_end
+ " Go-enable test failure on "
+ goenable_host + " due to unknown test "
data.append(goenable_start + " to " + goenable_end +
" Go-enable test failure on " +
goenable_host + " due to unknown test "
"failing\n")
goenable_start = goenable_end = goenable_host = None
@ -299,8 +299,8 @@ class Correlator:
"threshold reached", line)):
config_end = datetime.strptime(line[0:19],
"%Y-%m-%dT%H:%M:%S")
if (config_tst_f
!= "/etc/goenabled.d/config_goenabled_check.sh"):
if (config_tst_f !=
"/etc/goenabled.d/config_goenabled_check.sh"):
try:
daemon_fail = self.search_daemon_fail(
config_start, config_end, config_host)
@ -308,8 +308,8 @@ class Correlator:
logger.error(e)
if (config_tst_f ==
"/etc/goenabled.d/config_goenabled_check.sh"
or daemon_fail):
"/etc/goenabled.d/config_goenabled_check.sh" or
daemon_fail):
try:
puppet_error = self.search_puppet_error(
config_start, config_end)
@ -320,22 +320,22 @@ class Correlator:
"%Y-%m-%dT%H:%M:%S")
config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S")
if puppet_error:
data.append(config_start + " to " + config_end
+ " Configuration failure on "
+ config_host + " due to:\n"
+ puppet_error)
data.append(config_start + " to " + config_end +
" Configuration failure on " +
config_host + " due to:\n" +
puppet_error)
else:
data.append(config_start + " to " + config_end
+ " Configuration failure on "
+ config_host
+ " due to unknown cause\n")
data.append(config_start + " to " + config_end +
" Configuration failure on " +
config_host +
" due to unknown cause\n")
else:
config_start = config_start.strftime(
"%Y-%m-%dT%H:%M:%S")
config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S")
data.append(config_start + " to " + config_end
+ " Possible configuration failure on "
+ config_host + "\n")
data.append(config_start + " to " + config_end +
" Possible configuration failure on " +
config_host + "\n")
config_start = config_end = config_host = None
config_tst_f = puppet_error = None
@ -344,9 +344,9 @@ class Correlator:
re.search(hb_loss_host + " Connectivity Recovered ",
line)):
hb_loss_end = line[0:19]
data.append(hb_loss_start + " to " + hb_loss_end
+ " Heartbeat loss failure on " + hb_loss_host
+ " due to too many missing heartbeat "
data.append(hb_loss_start + " to " + hb_loss_end +
" Heartbeat loss failure on " + hb_loss_host +
" due to too many missing heartbeat "
"messages\n")
hb_loss_start = hb_loss_end = hb_loss_host = None
@ -355,9 +355,9 @@ class Correlator:
hb_loss_host and re.search(
hb_loss_host + " Graceful Recovery Wait", line)):
hb_loss_end = line[0:19]
data.append(hb_loss_start + " to " + hb_loss_end
+ " Heartbeat loss failure on " + hb_loss_host
+ " due to too many missing heartbeat "
data.append(hb_loss_start + " to " + hb_loss_end +
" Heartbeat loss failure on " + hb_loss_host +
" due to too many missing heartbeat "
"messages (recovery over disabled)\n")
hb_loss_start = hb_loss_end = hb_loss_host = None
@ -383,8 +383,8 @@ class Correlator:
if (re.search("Error : " + host + " (.+) Heartbeat Loss ",
line)):
date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
if (date >= start_time - timedelta(minutes=1)
and date <= end_time):
if (date >= start_time - timedelta(minutes=1) and
date <= end_time):
hb_loss = True
break
@ -405,12 +405,12 @@ class Correlator:
with open(file_path, "r") as daemon_failures:
for line in daemon_failures:
if (re.search("\\d " + host
+ " (.+) Failed to run the puppet manifest",
if (re.search("\\d " + host +
" (.+) Failed to run the puppet manifest",
line)):
date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
if (date >= start_time - timedelta(seconds=10)
and date <= end_time):
if (date >= start_time - timedelta(seconds=10) and
date <= end_time):
daemon_fail = True
break
@ -433,8 +433,8 @@ class Correlator:
for line in puppet_errors:
if "Error: " in line:
date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
if (date >= start_time - timedelta(seconds=10)
and date <= end_time):
if (date >= start_time - timedelta(seconds=10) and
date <= end_time):
puppet_log = line
break
@ -460,13 +460,13 @@ class Correlator:
if "force failed by SM" in line:
host = re.findall("Error : (.+) is being", line)[0]
if hostname == "all" or host == hostname:
data.append(line[0:19] + " " + host
+ " force failed by SM\n")
data.append(line[0:19] + " " + host +
" force failed by SM\n")
elif "Graceful Recovery Failed" in line:
host = re.findall("Info : (.+) Task:", line)[0]
if hostname == "all" or host == hostname:
data.append(line[0:19] + " " + host
+ " graceful recovery failed\n")
data.append(line[0:19] + " " + host +
" graceful recovery failed\n")
elif "MNFA ENTER" in line:
mnfa_start = datetime.strptime(line[0:19],
"%Y-%m-%dT%H:%M:%S")
@ -487,9 +487,9 @@ class Correlator:
"%Y-%m-%dT%H:%M:%S")
mnfa_duration -= mnfa_start
mnfa_start = mnfa_start.strftime("%Y-%m-%dT%H:%M:%S")
data.append(mnfa_start + " Multi-node failure avoidance "
+ "(duration: " + str(mnfa_duration)
+ "; history:" + mnfa_hist + ")\n")
data.append(mnfa_start + " Multi-node failure avoidance " +
"(duration: " + str(mnfa_duration) +
"; history:" + mnfa_hist + ")\n")
mnfa_start, mnfa_hist = None, ""
@ -506,9 +506,9 @@ class Correlator:
svc_failed = re.findall(
r"Service \((.+)\) is failed", line)[0]
if hostname == "all" or host == hostname:
data.append(line[0:19] + " " + host
+ " service failure (" + svc_failed
+ ")\n")
data.append(line[0:19] + " " + host +
" service failure (" + svc_failed +
")\n")
return data
@ -524,7 +524,9 @@ class Correlator:
# Open 'alarm' output file from alarm plugin and read it
file_path = os.path.join(self.plugin_output_dir, "alarm")
if not os.path.exists(file_path):
logger.debug("No alarms found")
return data
with open(file_path, "r") as alarm:
extract = False
for line in alarm:
@ -547,8 +549,8 @@ class Correlator:
temp = []
for entry in data:
temp.append(entry["name"] + " - set: " + str(entry["set"])
+ ", clear: " + str(entry["clear"]) + "\n")
temp.append(entry["name"] + " - set: " + str(entry["set"]) +
", clear: " + str(entry["clear"]) + "\n")
data = temp
return data
@ -572,8 +574,8 @@ class Correlator:
host = re.findall("Info : (.+) is ENABLED", line)[0]
state = re.findall("is (.+)\n", line)[0].lower()
if hostname == "all" or hostname in host:
data.append(line[0:19] + " " + host + " " + state
+ "\n")
data.append(line[0:19] + " " + host + " " +
state + "\n")
elif "locked-disabled" in line:
host = re.findall(
"Info : (.+) u?n?locked-disabled", line)[0]

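Most of the correlator.py churn above is a mechanical reflow for
item 8: pycodestyle's W503 warning (line break before a binary
operator) is cleared by moving each continuation operator to the
end of the broken line, as in this abridged pattern from the diff:

    # before: W503, operator starts the continuation line
    data.append(start_time + " to " + end_time
                + " Uncontrolled swact ...")

    # after: operator ends the line being continued
    data.append(start_time + " to " + end_time +
                " Uncontrolled swact ...")
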
@ -1,6 +1,6 @@
########################################################################
#
# Copyright (c) 2022 Wind River Systems, Inc.
# Copyright (c) 2022 - 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -23,6 +23,7 @@
########################################################################
import logging
import mmap
import os
import re
import subprocess
@ -48,6 +49,10 @@ sys.dont_write_bytecode = True
logger = logging.getLogger(__name__)
# regex expression used to get the hostname from the host dir name
# eg: chops '_20221201.213332' off of controller-0_20221201.213332
regex_chop_bundle_date = r"_\d{8}\.\d{6}"
class ExecutionEngine:
def __init__(self, opts, input_dir, output_dir):
@ -62,7 +67,9 @@ class ExecutionEngine:
self.opts = opts
self.hosts = {"controllers": {}, "workers": {}, "storages": {}}
self.active_controller_directory = None
self.active_controller_hostname = None
self.host_dirs = []
self.hostnames = []
if not os.path.isdir(input_dir):
logger.error("Error: Invalid input directory: %s", input_dir)
@ -89,37 +96,48 @@ class ExecutionEngine:
except subprocess.CalledProcessError as e:
logger.error(e)
# TODO: Need a better way to figure out the active controller
# Consider getting the system info from all hosts.
#
# Determine the active controller and load system info from it.
for folder in (f.path for f in os.scandir(input_dir)):
logger.debug("folder: %s", os.path.basename(folder))
# skip over the tarballs
logger.debug("base folder: %s", os.path.basename(folder))
# skip over files (the tarballs)
if not os.path.isdir(folder):
continue
basename = os.path.basename(folder)
if basename == "report_analysis":
continue
# Get the hostname from the host folder
hostname = re.sub(regex_chop_bundle_date, "", basename)
self.hostnames.append(hostname)
logger.debug("searching for active controller: %s" % hostname)
host_dir = folder
extra_path = os.path.join(host_dir, "var", "extra")
database_path = os.path.join(host_dir, extra_path, "database")
host_info_path = os.path.join(host_dir, extra_path, "host.info")
if os.path.isdir(host_dir):
extra_path = os.path.join(host_dir, "var", "extra")
# don't analyse a directory that doesn't contain
# a 'var/extra' dir.
if not os.path.exists(extra_path):
logger.warning("missing var/extra for %s" % hostname)
continue
database_path = os.path.join(host_dir, extra_path, "database")
hostinfo_path = os.path.join(host_dir, extra_path, "host.info")
if os.path.exists(database_path):
if os.listdir(database_path):
logger.info("Active Ctrl: %s" % hostname)
self.active_controller_directory = folder
self.active_controller_hostname = hostname
self.host_dirs.append(host_dir)
logger.debug("Host Dirs: %s", self.host_dirs)
if os.path.exists(host_info_path):
# save host folder path based on nodetype
if os.path.exists(hostinfo_path):
hostname, subfunction = self._extract_subfunction(
host_info_path)
hostinfo_path)
if "controller" in subfunction:
self.hosts["controllers"][hostname] = folder
elif "worker" in subfunction:
@ -127,9 +145,8 @@ class ExecutionEngine:
elif "storage" in subfunction:
self.hosts["storages"][hostname] = folder
self.active_controller_directory = folder
if not self.active_controller_directory:
raise ValueError("Active controller not found")
logger.error("Active Ctrl: NOT FOUND")
def execute(self, plugins, output_dir):
"""Run a list of plugins
@ -178,6 +195,7 @@ class ExecutionEngine:
os.path.join(folderpath, file)
for file in plugin.state["files"]
],
plugin.state["exclude"],
)
# creating output file
@ -186,37 +204,56 @@ class ExecutionEngine:
f"substring_{hostname}",
)
if self.opts.verbose:
logger.info("... output at "
+ os.path.abspath(output_file))
with open(output_file, "w") as file:
file.write(
f"Date range: {self.opts.start} until "
f"{self.opts.end}\n"
)
file.write(
f"substrings: "
f"{' '.join(plugin.state['substring'])}\n"
)
for line in events:
if line[-1] == "\n":
file.write(line)
else:
file.write(line + "\n")
logger.info("... output at " +
os.path.abspath(output_file))
if events:
with open(output_file, "w") as file:
file.write(
f"Date range: {self.opts.start} until "
f"{self.opts.end}\n"
)
file.write(
f"substrings: "
f"{' '.join(plugin.state['substring'])}\n"
)
for line in events:
if line[-1] == "\n":
file.write(line)
else:
file.write(line + "\n")
else:
if plugin.state["algorithm"] == algorithms.SYSTEM_INFO:
for host_dir in self.host_dirs:
info = system_info(host_dir)
system_info_output = os.path.join(plugin_output_dir,
"system_info")
with open(system_info_output, "w") as file:
for i in info:
file.write(i + "\n")
for k, v in self.hosts.items():
file.write(f"{k}: {','.join(v.keys())}\n")
# Get system info of the active controller first
# and then put the system info of each host in the
# system info output file.
system_info_output = os.path.join(plugin_output_dir,
"system_info")
if os.path.exists(system_info_output):
os.remove(system_info_output)
if self.opts.verbose:
logger.info(processing + ", output at " +
os.path.abspath(system_info_output))
hostname = None
host_dir = None
if self.active_controller_directory is None:
hostname = re.sub(regex_chop_bundle_date, "",
os.path.basename(self.host_dirs[0]))
host_dir = self.host_dirs[0]
else:
hostname = self.active_controller_hostname
host_dir = self.active_controller_directory
system_info(hostname, host_dir,
system_info_output,
self.hosts, True)
for host_dir in self.host_dirs:
if host_dir != self.active_controller_directory:
hostname = re.sub(regex_chop_bundle_date, "",
os.path.basename(host_dir))
system_info(hostname,
host_dir,
system_info_output,
None,
False)
elif plugin.state["algorithm"] == algorithms.AUDIT:
hosts = {}
@ -284,7 +321,6 @@ class ExecutionEngine:
file.write(f"{k}:\n")
for date in v["dates"]:
file.write(f" {date}\n")
# creating output log file
with open(log_output, "w") as file:
for k, v in logs.items():
@ -312,14 +348,16 @@ class ExecutionEngine:
self._create_output_file(
"maintenance_errors", plugin_output_dir,
maintenance_errors(self.hosts, self.opts.start,
self.opts.end),
self.opts.end,
plugin.state["exclude"]),
processing
)
elif plugin.state["algorithm"] == algorithms.DAEMON_FAILURES:
self._create_output_file(
"daemon_failures", plugin_output_dir,
daemon_failures(self.hosts, self.opts.start,
self.opts.end),
self.opts.end,
plugin.state["exclude"]),
processing
)
elif plugin.state["algorithm"] == algorithms.STATE_CHANGES:
@ -330,6 +368,35 @@ class ExecutionEngine:
processing
)
# Dump a summary of data found by the plugins
if os.path.exists(plugin_output_dir):
# Print a summary of the logs/data gathered by the plugins
empty_files = ""
logger.info("Plugin Results:\n")
for fn in os.listdir(plugin_output_dir):
filename = os.path.join(plugin_output_dir, fn)
with open(filename, "r+") as f:
# Show how much data is in each plugin's output file
if os.path.isfile(filename) and os.path.getsize(filename):
buf = mmap.mmap(f.fileno(), 0)
entries = 0
readline = buf.readline
while readline():
entries += 1
if fn == "system_info":
logger.info(filename)
else:
logger.info("%s has %d entries" %
(filename, entries))
else:
empty_files += fn + " "
if empty_files:
logger.info("\n... nothing found by plugins: %s" % empty_files)
else:
logger.error("Plugin output dir missing: %s" % plugin_output_dir)
sys.exit("... exiting")
# Running the correlator and printing the output from it
self.run_correlator(output_dir, plugin_output_dir)
@ -357,36 +424,46 @@ class ExecutionEngine:
failures.append("\nTotal failures found: " + str(failures_len) + "\n")
events.append("\nTotal events found: " + str(events_len) + "\n")
alarms.append("\nTotal alarms found: " + str(alarms_len) + "\n")
state_changes.append("\nTotal state changes found: "
+ str(state_changes_len) + "\n")
state_changes.append("\nTotal state changes found: " +
str(state_changes_len) + "\n")
# TODO: Put at the end of the report
logger.info("\nRunning correlator... view report at "
+ output_dir)
self._create_output_file("correlator_failures", output_dir,
logger.info("\nCorrelated Results:\n")
self._create_output_file("failures", output_dir,
failures, "")
self._create_output_file("correlator_events", output_dir,
self._create_output_file("events", output_dir,
events, "")
self._create_output_file("correlator_alarms", output_dir,
self._create_output_file("alarms", output_dir,
alarms, "")
self._create_output_file("correlator_state_changes", output_dir,
self._create_output_file("state_changes", output_dir,
state_changes, "")
max = 0
for sl in [events_len, alarms_len, state_changes_len, failures_len]:
if len(str(sl)) > max:
max = len(str(sl))
if not self.opts.verbose:
logger.info("Events : " + str(events_len))
logger.info("Alarms : " + str(alarms_len))
logger.info("State Changes: " + str(state_changes_len))
logger.info("Failures : " + str(failures_len))
logger.info("Events : " + str(events_len) +
" " * (max - len(str(events_len))) +
" " + output_dir + "/events")
logger.info("Alarms : " + str(alarms_len) +
" " * (max - len(str(alarms_len))) +
" " + output_dir + "/alarms")
logger.info("State Changes: " + str(state_changes_len) +
" " * (max - len(str(state_changes_len))) +
" " + output_dir + "/state_changes")
logger.info("Failures : " + str(failures_len) +
" " * (max - len(str(failures_len))) +
" " + output_dir + "/failures")
for f in failures[:-1]:
if "Uncontrolled swact" in f:
logger.info(f[0:19] + " "
+ re.findall("active controller:? (.+)\n",
f)[0] + " uncontrolled swact")
logger.info(f[0:19] + " " +
re.findall("active controller:? (.+)\n",
f)[0] + " uncontrolled swact")
elif "failure on" in f:
host = re.findall(r"failure on ([^\s]+) ", f)[0]
logger.info(f[0:19] + " " + host + " "
+ re.findall("^(.+) failure on ",
f[43:])[0].lower() + " failure")
logger.info(f[0:19] + " " + host + " " +
re.findall("^(.+) failure on ",
f[43:])[0].lower() + " failure")
else:
logger.info(f[:-1])
else:
@ -405,9 +482,9 @@ class ExecutionEngine:
logger.info(k + ": " + str(v) + " time(s)")
logger.info("\nAlarms: " + str(alarms_len))
logger.info("The full list of alarms can be found at "
+ os.path.abspath(output_dir)
+ "/correlator_alarms")
logger.info("The full list of alarms can be found at " +
os.path.abspath(output_dir) +
"/alarms")
# Dictionary to keep track of number of times state changes
# happens on each host
@ -451,8 +528,8 @@ class ExecutionEngine:
else:
file.write(i + "\n")
if self.opts.verbose:
output = ("... output at "
+ os.path.abspath(os.path.join(directory, filename)))
output = ("... output at " +
os.path.abspath(os.path.join(directory, filename)))
if processing == "":
logger.info(output)
else:

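The new plugin results summary above sizes each plugin output file
by counting its lines through mmap; a self-contained sketch of that
counting helper (the path below is hypothetical):

    import mmap
    import os

    def count_entries(filename):
        # count lines the way the plugin results summary does
        if not (os.path.isfile(filename) and os.path.getsize(filename)):
            return 0
        with open(filename, "r+") as f:
            buf = mmap.mmap(f.fileno(), 0)
            entries = 0
            while buf.readline():
                entries += 1
            return entries

    print(count_entries("report_analysis/plugins/maintenance_errors"))
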
@ -1,6 +1,6 @@
########################################################################
#
# Copyright (c) 2022 Wind River Systems, Inc.
# Copyright (c) 2022 - 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -39,6 +39,7 @@ class Plugin:
"files": [],
"hosts": [],
"substring": [],
"exclude": [],
"alarm_exclude": [],
"entity_exclude": [],
"start": None,
@ -78,6 +79,10 @@ class Plugin:
line (string): Line from plugin file to extract
"""
# allow plugins to have empty lines or comments starting with #
if len(line) <= 1 or line[0] == '#':
return
# split string from first '=', left side is label right side is value
data = line.strip().split("=", 1)
if len(data) <= 1:
@ -85,11 +90,17 @@ class Plugin:
label = data[0]
value = data[1]
label = label.replace(" ", "")
# ignore labels that don't start with an alphabetical char
if label[0].isalpha() is False:
raise ValueError("Invalid label value")
try:
if label == "algorithm":
self.state["algorithm"] = value.replace(" ", "")
elif label == "substring":
self.state["substring"].append(data[1])
elif label == "exclude":
self.state["exclude"].append(data[1])
elif label == "hosts":
self.state["hosts"] = value.replace(" ", "").split(",")
elif label == "alarm_exclude":

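A condensed sketch of the new parsing rules in Plugin: comments and
blank lines are skipped, a label must start with a letter, and the
new exclude label accumulates like substring (the state dict and
driver loop are simplified from the class above):

    def extract(line, state):
        # allow plugins to have empty lines or comments starting with #
        if len(line) <= 1 or line[0] == '#':
            return
        # split on the first '='; left side is label, right is value
        data = line.strip().split("=", 1)
        if len(data) <= 1:
            return
        label = data[0].replace(" ", "")
        # reject labels that don't start with an alphabetical char
        if label[0].isalpha() is False:
            raise ValueError("Invalid label value")
        if label in ("substring", "exclude"):
            state[label].append(data[1])

    state = {"substring": [], "exclude": []}
    for line in ["# logs to exclude\n", "\n", "substring=ERROR: sm\n",
                 "exclude=Failed to set alarm\n"]:
        extract(line, state)
    # state: {'substring': ['ERROR: sm'], 'exclude': ['Failed to set alarm']}
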
@ -46,8 +46,8 @@ def audit(start, end, audit_log_path):
# Counts sum of audits from all subclouds
]
INDEX_MIDDLE_WORD = 1
data = [("These rates and totals represent the sum of audits from "
+ "all subclouds")]
data = [("These rates and totals represent the sum of audits " +
"from all subclouds")]
def command(text):

@ -19,7 +19,7 @@ import os
from plugin_algs.substring import substring
def daemon_failures(hosts, start, end):
def daemon_failures(hosts, start, end, exclude_list=None):
"""Daemon failures algorithm
Presents all "Failed to run the puppet manifest" log messages in the system
@ -37,6 +37,6 @@ def daemon_failures(hosts, start, end):
daemon_files.append(daemon_path)
daemon_substrings = ["Failed to run the puppet manifest"]
data = substring(start, end, daemon_substrings, daemon_files)
data = substring(start, end, daemon_substrings, daemon_files, exclude_list)
return sorted(data)

@ -19,7 +19,7 @@ import os
from plugin_algs.substring import substring
def maintenance_errors(hosts, start, end):
def maintenance_errors(hosts, start, end, exclude_list=None):
"""Maintenance errors algorithm
Presents maintenance errors and other relevant log messages in system,
such as "Configuration failure"
@ -51,6 +51,6 @@ def maintenance_errors(hosts, start, end):
"auto recovery disabled",
"Graceful Recovery Failed",
"MNFA ENTER", "MNFA EXIT", "MNFA POOL"]
data = substring(start, end, mtc_substrings, mtc_files)
data = substring(start, end, mtc_substrings, mtc_files, exclude_list)
return sorted(data)

@ -1,6 +1,6 @@
########################################################################
#
# Copyright (c) 2022 Wind River Systems, Inc.
# Copyright (c) 2022 - 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -24,7 +24,7 @@ import subprocess
logger = logging.getLogger(__name__)
def substring(start, end, substr, files):
def substring(start, end, substr, files, exclude_list=None):
"""Substring algorithm
Looks for all substrings in substr within files
@ -33,7 +33,7 @@ def substring(start, end, substr, files):
end (string): End time for analysis
substr (string list): List of substrings to look for
files (string list): List of absolute filepaths to search in
exclude_list (string list): list of strings to exclude from report
Errors:
FileNotFoundError
"""
@ -49,15 +49,17 @@ def substring(start, end, substr, files):
if (re.search("controller-1_(.+)/var/log/mtcAgent.log",
file)):
continue
raise FileNotFoundError(f"File not found: {file}")
else:
data.append("File not found: " + file)
continue
cont = True
# Searching through file
command = (f"""grep -Ea "{'|'.join(s for s in substr)}" """
f"""{file} 2>/dev/null""")
status = _continue(start, end, file)
if (status == CONTINUE_CURRENT
or status == CONTINUE_CURRENT_OLD):
if (status == CONTINUE_CURRENT or
status == CONTINUE_CURRENT_OLD):
# continue with current file
if status == CONTINUE_CURRENT:
cont = False
@ -70,8 +72,8 @@ def substring(start, end, substr, files):
f"""{file}.{n} 2>/dev/null""")
status = _continue(start, end, f"{file}.{n}")
if (status == CONTINUE_CURRENT
or status == CONTINUE_CURRENT_OLD):
if (status == CONTINUE_CURRENT or
status == CONTINUE_CURRENT_OLD):
if status == CONTINUE_CURRENT:
cont = False
_evaluate_substring(start, end, data, command)
@ -85,8 +87,8 @@ def substring(start, end, substr, files):
status = _continue(start, end, f"{file}.{n}.gz",
compressed=True)
if (status == CONTINUE_CURRENT
or status == CONTINUE_CURRENT_OLD):
if (status == CONTINUE_CURRENT or
status == CONTINUE_CURRENT_OLD):
if status == CONTINUE_CURRENT:
cont = False
_evaluate_substring(start, end, data, command)
@ -97,6 +99,17 @@ def substring(start, end, substr, files):
logger.error(e)
continue
# now remove any logs that contain substrings in the exclude_list
if exclude_list:
filtered_data = []
for e in data:
found = False
for exclude in exclude_list:
if e.find(exclude) != -1:
found = True
if found is False:
filtered_data.append(e)
return sorted(filtered_data)
return sorted(data)

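Callers opt in by passing the new argument; a hedged usage sketch
(times and bundle path are hypothetical):

    events = substring("2023-05-20 00:00:00", "2023-05-23 00:00:00",
                       ["ERROR: sm"],
                       ["controller-0_20230523.172222/var/log/sm.log"],
                       exclude_list=["Failed to set alarm"])
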
@ -1,6 +1,6 @@
########################################################################
#
# Copyright (c) 2022 Wind River Systems, Inc.
# Copyright (c) 2022 - 2023 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -17,47 +17,112 @@ import os
import re
def system_info(host_dir):
def system_info(hostname, host_dir, output_dir, hosts, loud=False):
"""System info algorithm
Presents basic information about the system, such as the build type
Parameters:
host_dir (string): path to the collect host dir
hostname (string): name of the host
host_dir (string): path to the collect host dir
output_dir (string): path to the file to store the system info
hosts (dict): dictionary of hosts grouped by nodetype
loud (boolean): when True print system info to stdout
Returns: nothing
"""
data = []
with open(
os.path.join(host_dir, "etc", "platform", "platform.conf")
) as file:
if host_dir is None:
raise ValueError("system_info:No specified host dir")
# from /etc/platform/platform.conf
platform_conf = os.path.join(host_dir, "etc", "platform", "platform.conf")
# ... load the following items first
with open(platform_conf) as file:
for line in file:
if "system_mode" in line:
data.append(
f"System Mode: "
f"{re.match('^system_mode=(.*)', line).group(1)}"
)
val = re.match('^system_mode=(.*)', line).group(1)
data.append(f"System Mode: {val}")
elif "system_type" in line:
data.append(
f"System Type: "
f"{re.match('^system_type=(.*)', line).group(1)}"
)
val = re.match('^system_type=(.*)', line).group(1)
data.append(f"System Type: {val}")
elif "distributed_cloud_role" in line:
role = re.match('^distributed_cloud_role=(.*)',
line).group(1)
data.append(f"Distributed cloud role: {role}")
data.append(f"DC Role : {role}")
elif "sw_version" in line:
data.append(
f"SW Version: "
f"{re.match('^sw_version=(.*)', line).group(1)}"
)
val = re.match('^sw_version=(.*)', line).group(1)
data.append(f"S/W Version: {val}")
# ... followed by these items
with open(platform_conf) as file:
for line in file:
if "nodetype" in line:
val = re.match('^nodetype=(.*)', line).group(1)
data.append(f"Node Type : {val}")
elif "subfunction" in line:
val = re.match('^subfunction=(.*)', line).group(1)
data.append(f"subfunction: {val}")
elif "oam_interface" in line:
val = re.match('^oam_interface=(.*)', line).group(1)
data.append(f"OAM Iface : {val}")
elif "management_interface" in line:
val = re.match('^management_interface=(.*)', line).group(1)
data.append(f"Mgmt Iface : {val}")
elif "cluster_host_interface" in line:
val = re.match('^cluster_host_interface=(.*)', line).group(1)
data.append(f"Clstr Iface: {val}")
# /etc/os-release info
with open(
os.path.join(host_dir, "etc", "os-release")
) as file:
for line in file:
if "PRETTY_NAME" in line:
val = (re.match('^PRETTY_NAME=(.*)', line).group(1))
val = val.strip('\"')
data.append(f"OS Release : {val}")
# /etc/build.info
with open(
os.path.join(host_dir, "etc", "build.info")
) as file:
for line in file:
if "BUILD_TYPE" in line:
data.append(
f"Build Type: "
f"{re.match('^BUILD_TYPE=(.*)', line).group(1)}"
)
elif re.match("^OS=(.*)", line):
data.append(f"OS: {re.match('^OS=(.*)', line).group(1)}")
val = (re.match('^BUILD_TYPE=(.*)', line).group(1))
val = val.strip('\"')
data.append(f"Build Type : {val}")
elif "BUILD_DATE" in line:
val = (re.match('^BUILD_DATE=(.*)', line).group(1))
val = val.strip('\"')
data.append(f"Build Date : {val}")
elif "BUILD_DIR" in line:
val = (re.match('^BUILD_DIR=(.*)', line).group(1))
val = val.strip('\"')
data.append(f"Build Dir : {val}")
return data
with open(output_dir, "a") as file:
dashs = "-" * len(hostname)
file.write("\n" + hostname + "\n" + dashs + "\n")
for i in data:
file.write(i + "\n")
if loud is True:
print(i)
if hosts is not None:
for k, v in hosts.items():
if not len(v.keys()):
continue
if k == "storages":
k += " "
if k == "workers":
k += " "
file.write(f"{k}: {','.join(v.keys())}\n")
if loud is True:
print(f"{k}: {','.join(v.keys())}")
# create an empty line following the system info dump
if loud is True:
print("")
return

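With the rewrite, system_info() appends one block per host to the
shared output file, active controller first; with hypothetical
values the active controller's block renders roughly as:

    controller-0
    ------------
    System Mode: duplex
    System Type: Standard
    S/W Version: 22.12
    Node Type : controller
    subfunction: controller
    OS Release : Debian GNU/Linux 11 (bullseye)
    Build Type : Formal
    Build Date : 2023-05-20 00:00:00 +0000
    controllers: controller-0,controller-1
    workers : worker-0,worker-1
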
@ -1 +1,2 @@
algorithm=maintenance_errors
exclude=task clear

@ -0,0 +1,14 @@
algorithm=substring
files=var/log/sm.log
hosts=controllers
# logs to exclude
substring=ERROR: sm
exclude=Failed to set alarm
exclude=Failed to set log
exclude=Failed to clear alarm
exclude=Failed to get all alarms
exclude=Failed to query service based on pid
exclude=Failed to look up interface name
exclude=Failed to stop service heartbeat thread
exclude=Heartbeat is not required

@ -1,5 +1,5 @@
algorithm=substring
files=var/log/mtcAgent.log, var/log/sm.log
files=var/log/mtcAgent.log
hosts=controllers
substring=operation failed
substring=Failed to send message

@ -446,6 +446,7 @@ bundle_names = []
bundles = []
ignore_list = [analysis_folder_name]
ignore_list += ["apps", "horizon", "lighttpd", "lost+found", "sysinv-tmpdir"]
ignore_list += ["patch-api-proxy-tmpdir", "platform-api-proxy-tmpdir"]
with open(os.path.join(output_dir, "untar.log"), "a") as logfile:
for obj in (os.scandir(input_dir)):
@ -463,6 +464,9 @@ with open(os.path.join(output_dir, "untar.log"), "a") as logfile:
date_time = obj.name[-15:]
if args.debug:
logger.debug("Found Dir : %s : %s", obj.name, date_time)
elif os.path.islink(obj.path):
# ignore sym links
continue
else:
if not tarfile.is_tarfile(obj.path):
continue
@ -559,6 +563,7 @@ elif args.debug:
# create the output directory ; report_analysis
output_dir = os.path.join(path_file, analysis_folder_name)
print("\nReport: %s\n" % output_dir)
if not os.path.exists(output_dir):
os.makedirs(output_dir, exist_ok=True)
@ -567,7 +572,7 @@ try:
engine = ExecutionEngine(args, path_file, output_dir)
except ValueError as e:
logger.error(str(e))
sys.exit("Confirm you are running the report tool on a collect bundle")
logger.error("Confirm you are running the report tool on a collect bundle")
if args.algorithm:
plugins.append(Plugin(opts=vars(args)))

@ -30,7 +30,7 @@ override_dh_auto_install:
install -m 755 -p expect_done $(ROOT)/usr/local/sbin/expect_done
install -m 755 -p mariadb-cli.sh $(ROOT)/usr/local/sbin/mariadb-cli
# Report Tool
# Report Tool
install -m 755 -p report/report.py $(ROOT)/usr/local/bin/report/report.py
install -m 755 -p report/execution_engine.py $(ROOT)/usr/local/bin/report/execution_engine.py
install -m 755 -p report/algorithms.py $(ROOT)/usr/local/bin/report/algorithms.py
@ -38,7 +38,7 @@ override_dh_auto_install:
install -m 755 -p report/correlator.py $(ROOT)/usr/local/bin/report/correlator.py
install -m 755 -p report/README $(ROOT)/usr/local/bin/report/README
# Report Tool Plugin Algorithms
# Report Tool Plugin Algorithms
install -m 755 -p report/plugin_algs/alarm.py $(ROOT)/usr/local/bin/report/plugin_algs/alarm.py
install -m 755 -p report/plugin_algs/audit.py $(ROOT)/usr/local/bin/report/plugin_algs/audit.py
install -m 755 -p report/plugin_algs/daemon_failures.py $(ROOT)/usr/local/bin/report/plugin_algs/daemon_failures.py
@ -51,20 +51,21 @@ override_dh_auto_install:
install -m 755 -p report/plugin_algs/swact_activity.py $(ROOT)/usr/local/bin/report/plugin_algs/swact_activity.py
install -m 755 -p report/plugin_algs/system_info.py $(ROOT)/usr/local/bin/report/plugin_algs/system_info.py
# Report Tool Plugins
# Report Tool Plugins
install -m 755 -p report/plugins/alarm $(ROOT)/usr/local/bin/report/plugins/alarm
install -m 755 -p report/plugins/daemon_failures $(ROOT)/usr/local/bin/report/plugins/daemon_failures
install -m 755 -p report/plugins/heartbeat_loss $(ROOT)/usr/local/bin/report/plugins/heartbeat_loss
install -m 755 -p report/plugins/maintenance_errors $(ROOT)/usr/local/bin/report/plugins/maintenance_errors
install -m 755 -p report/plugins/process_failures $(ROOT)/usr/local/bin/report/plugins/process_failures
install -m 755 -p report/plugins/puppet_errors $(ROOT)/usr/local/bin/report/plugins/puppet_errors
install -m 755 -p report/plugins/sm_errors $(ROOT)/usr/local/bin/report/plugins/sm_errors
install -m 755 -p report/plugins/state_changes $(ROOT)/usr/local/bin/report/plugins/state_changes
install -m 755 -p report/plugins/substring $(ROOT)/usr/local/bin/report/plugins/substring
install -m 755 -p report/plugins/swact_activity $(ROOT)/usr/local/bin/report/plugins/swact_activity
install -m 755 -p report/plugins/system_info $(ROOT)/usr/local/bin/report/plugins/system_info
install -m 755 -p report/plugins/substring_hosts $(SYSCONFDIR)/collect/plugins/substring_hosts
# Collect Plugins
# Collect Plugins
install -m 755 -p collect_sysinv.sh $(SYSCONFDIR)/collect.d/collect_sysinv
install -m 755 -p collect_psqldb.sh $(SYSCONFDIR)/collect.d/collect_psqldb
install -m 755 -p collect_mariadb.sh $(SYSCONFDIR)/collect.d/collect_mariadb