From b6343a9e55e890c8f8e5161599ee66a4e4960db9 Mon Sep 17 00:00:00 2001 From: Eric MacDonald <eric.macdonald@windriver.com> Date: Tue, 23 May 2023 17:22:22 +0000 Subject: [PATCH] Improve report tool system_info plugin behavior The current system_info plugin logs the system info for the last host in host_dirs rather than that of the active controller. It also does not capture the system info for all the nodes into its plugin output file. This update improves the system_info plugin and implements the following rendering and substring handling improvements. 1. Improve system_info plugin capture and render. 2. Adds which controller was active at the time of the collect to the system info rendering output. 3. Improve report analysis rendering by displaying the full path to plugin and correlation files. 4. Adds string exclude support to the substring algorithm. This allows generic string searches like ERROR to be gathered while also allowing specific logs that are considered noise to be filtered out. 5. Create a separate SM errors substring plugin using the new exclude option. 6. Adds support for commented and empty lines in the plugins. This allows for properly commented and formatted plugins. 7. Adds plugin label name error checking. This allows easier debugging of improperly coded plugins. 8. Fixed additional pep8 warnings. 
Test Plan: PASS: Verify on-system collect with --report option PASS: Verify on-system report generation PASS: Verify off-system report generation from git PASS: Verify system_info plugin collects info from all hosts PASS: Verify report displays system_info from active controller PASS: Verify handling when no active controller is detected PASS: Verify new sm_errors substring plugin with excludes PASS: Verify plugins can have empty or # commented lines PASS: Verify report tool plugins output include path to each plugin file PASS: Verify report tool correlations include path to each correlation file PASS: Verify report tool plugin label parsing error handling PASS: Verify all files pass pep8 without warning or error Story: 2010533 Task: 48072 Change-Id: I6d0253a4c3d8804a5e45b970d766e578ea69368f Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com> --- .../debian-scripts/report/correlator.py | 176 ++++++++------- .../debian-scripts/report/execution_engine.py | 213 ++++++++++++------ .../collector/debian-scripts/report/plugin.py | 13 +- .../report/plugin_algs/audit.py | 4 +- .../report/plugin_algs/daemon_failures.py | 4 +- .../report/plugin_algs/maintenance_errors.py | 4 +- .../report/plugin_algs/substring.py | 33 ++- .../report/plugin_algs/system_info.py | 117 +++++++--- .../report/plugins/maintenance_errors | 1 + .../debian-scripts/report/plugins/sm_errors | 14 ++ .../debian-scripts/report/plugins/substring | 2 +- .../collector/debian-scripts/report/report.py | 7 +- tools/collector/debian/deb_folder/rules | 9 +- 13 files changed, 393 insertions(+), 204 deletions(-) create mode 100755 tools/collector/debian-scripts/report/plugins/sm_errors diff --git a/tools/collector/debian-scripts/report/correlator.py b/tools/collector/debian-scripts/report/correlator.py index c6b94344..1b0b7c14 100755 --- a/tools/collector/debian-scripts/report/correlator.py +++ b/tools/collector/debian-scripts/report/correlator.py @@ -1,6 +1,6 @@ 
######################################################################## # -# Copyright (c) 2022 Wind River Systems, Inc. +# Copyright (c) 2022 - 2023 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -109,8 +109,8 @@ class Correlator: ctrlr_link_down = re.findall( r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) " "sm:", line)[0] - elif (re.search("Neighbor (.+) is now in the down", line) - and start_time and not ctrlr_down): + elif (re.search("Neighbor (.+) is now in the down", line) and + start_time and not ctrlr_down): ctrlr_down = re.findall( r"Neighbor \((.+)\) received event", line)[0] elif (re.search("Service (.+) is failed and has reached max " @@ -121,8 +121,8 @@ class Correlator: r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:", line)[0] elif (svc_failed and re.search( - "active-failed\\s+\\| disabling-failed\\s+\\| " - + svc_failed, line)): + "active-failed\\s+\\| disabling-failed\\s+\\| " + + svc_failed, line)): if re.search(r"\| go-active-failed\s+\|", line): go_active_failed = True else: @@ -140,40 +140,40 @@ class Correlator: start_time = start_time.strftime("%Y-%m-%dT%H:%M:%S") end_time = end_time.strftime("%Y-%m-%dT%H:%M:%S") if link_down: - data.append(start_time + " to " + end_time - + " Uncontrolled swact, refer to SM logs " + data.append(start_time + " to " + end_time + + " Uncontrolled swact, refer to SM logs " "for in-depth analysis, original active " "controller: " + ctrlr_link_down + "\n") elif ctrlr_down: if hb_loss: - data.append(start_time + " to " + end_time - + " Uncontrolled swact due to " + data.append(start_time + " to " + end_time + + " Uncontrolled swact due to " "spontaneous reset of active " "controller " + ctrlr_down + "\n") else: - data.append(start_time + " to " + end_time - + " Uncontrolled swact likely due to " + data.append(start_time + " to " + end_time + + " Uncontrolled swact likely due to " "spontaneous reset of active " "controller " + ctrlr_down + "\n") elif svc_failed: if active_failed 
and go_active_failed: - data.append(start_time + " to " + end_time - + " Uncontrolled swact due to service " + data.append(start_time + " to " + end_time + + " Uncontrolled swact due to service " "failure (" + svc_failed + ") twice " "in 2 minutes was unsuccessful so " "\"bounced back\" to original active " "controller " + ctrlr_svc_fail + "\n") elif active_failed: - data.append(start_time + " to " + end_time - + " Uncontrolled swact due to service " + data.append(start_time + " to " + end_time + + " Uncontrolled swact due to service " "failure (" + svc_failed + ") twice " - "in 2 minutes on active controller " - + ctrlr_svc_fail + "\n") + "in 2 minutes on active controller " + + ctrlr_svc_fail + "\n") else: - data.append(start_time + " to " + end_time - + " Uncontrolled swact likely due to " - "service failure (" + svc_failed - + ") twice in 2 minutes on active " + data.append(start_time + " to " + end_time + + " Uncontrolled swact likely due to " + "service failure (" + svc_failed + + ") twice in 2 minutes on active " "controller " + ctrlr_svc_fail + "\n") start_time = end_time = svc_failed = None @@ -218,19 +218,19 @@ class Correlator: host[0] + " auto recovery disabled", line)): old = data[-1].split("due", 1) if len(old) == 1: - data[-1] = (data[-1][:-1] - + " (auto recovery disabled)\n") + data[-1] = (data[-1][:-1] + + " (auto recovery disabled)\n") else: - data[-1] = (old[0] - + "(auto recovery disabled) due" - + old[1]) + data[-1] = (old[0] + + "(auto recovery disabled) due" + + old[1]) auto_recov_dis = True elif "GOENABLED Failed" in line and not goenable_start: goenable_start, auto_recov_dis = line[0:19], False goenable_host = re.findall( "Error : (.+) got GOENABLED Failed", line)[0] - elif ("configuration failed or incomplete" in line - and not config_start): + elif ("configuration failed or incomplete" in line and not + config_start): config_start = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") auto_recov_dis = False @@ -248,8 +248,8 @@ class 
Correlator: if (re.search(host + " (.+) Heartbeat Loss (.+) " "\\(during recovery soak\\)", line)): old = data[-1] - data[-1] = (old[0:23] + line[0:19] + old[42:-1] - + " (recovery over disabled due to " + data[-1] = (old[0:23] + line[0:19] + old[42:-1] + + " (recovery over disabled due to " "heartbeat soak failure)\n") else: hb_loss_start = line[0:19] @@ -257,15 +257,15 @@ class Correlator: hb_loss_host = re.findall("Error : (.+) [CM]", line)[0] # Check if previous failure recorded was heartbeat loss due to # missing heartbeat messages - elif ("regained MTCALIVE from host that has rebooted" in line - and data and re.search(r"Heartbeat loss failure (.+) " - r"\(recovery over disabled\)", - data[-1])): + elif ("regained MTCALIVE from host that rebooted" in line and + data and re.search( + r"Heartbeat loss failure (.+) " + r"\(recovery over disabled\)", data[-1])): host = re.findall("failure on (.+) due to", data[-1])[0] if re.search(host + " regained MTCALIVE", line): old = data[-1].split("due", 1)[0] - data[-1] = (old[0:23] + line[0:19] + old[42:] - + "due to uncontrolled reboot\n") + data[-1] = (old[0:23] + line[0:19] + old[42:] + + "due to uncontrolled reboot\n") elif (hb_loss_start and not comm_loss and hb_loss_host and re.search(hb_loss_host + " Loss Of Communication for 5 " "seconds", line)): @@ -282,14 +282,14 @@ class Correlator: "threshold reached", line)): goenable_end = line[0:19] if goenable_tst_f: - data.append(goenable_start + " to " + goenable_end - + " Go-enable test failure on " - + goenable_host + " due to failing of " - + goenable_tst_f + "\n") + data.append(goenable_start + " to " + goenable_end + + " Go-enable test failure on " + + goenable_host + " due to failing of " + + goenable_tst_f + "\n") else: - data.append(goenable_start + " to " + goenable_end - + " Go-enable test failure on " - + goenable_host + " due to unknown test " + data.append(goenable_start + " to " + goenable_end + + " Go-enable test failure on " + + goenable_host + " due to 
unknown test " "failing\n") goenable_start = goenable_end = goenable_host = None @@ -299,8 +299,8 @@ class Correlator: "threshold reached", line)): config_end = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") - if (config_tst_f - != "/etc/goenabled.d/config_goenabled_check.sh"): + if (config_tst_f != + "/etc/goenabled.d/config_goenabled_check.sh"): try: daemon_fail = self.search_daemon_fail( config_start, config_end, config_host) @@ -308,8 +308,8 @@ class Correlator: logger.error(e) if (config_tst_f == - "/etc/goenabled.d/config_goenabled_check.sh" - or daemon_fail): + "/etc/goenabled.d/config_goenabled_check.sh" or + daemon_fail): try: puppet_error = self.search_puppet_error( config_start, config_end) @@ -320,22 +320,22 @@ class Correlator: "%Y-%m-%dT%H:%M:%S") config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S") if puppet_error: - data.append(config_start + " to " + config_end - + " Configuration failure on " - + config_host + " due to:\n" - + puppet_error) + data.append(config_start + " to " + config_end + + " Configuration failure on " + + config_host + " due to:\n" + + puppet_error) else: - data.append(config_start + " to " + config_end - + " Configuration failure on " - + config_host - + " due to unknown cause\n") + data.append(config_start + " to " + config_end + + " Configuration failure on " + + config_host + + " due to unknown cause\n") else: config_start = config_start.strftime( "%Y-%m-%dT%H:%M:%S") config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S") - data.append(config_start + " to " + config_end - + " Possible configuration failure on " - + config_host + "\n") + data.append(config_start + " to " + config_end + + " Possible configuration failure on " + + config_host + "\n") config_start = config_end = config_host = None config_tst_f = puppet_error = None @@ -344,9 +344,9 @@ class Correlator: re.search(hb_loss_host + " Connectivity Recovered ", line)): hb_loss_end = line[0:19] - data.append(hb_loss_start + " to " + hb_loss_end - + " Heartbeat loss 
failure on " + hb_loss_host - + " due to too many missing heartbeat " + data.append(hb_loss_start + " to " + hb_loss_end + + " Heartbeat loss failure on " + hb_loss_host + + " due to too many missing heartbeat " "messages\n") hb_loss_start = hb_loss_end = hb_loss_host = None @@ -355,9 +355,9 @@ class Correlator: hb_loss_host and re.search( hb_loss_host + " Graceful Recovery Wait", line)): hb_loss_end = line[0:19] - data.append(hb_loss_start + " to " + hb_loss_end - + " Heartbeat loss failure on " + hb_loss_host - + " due to too many missing heartbeat " + data.append(hb_loss_start + " to " + hb_loss_end + + " Heartbeat loss failure on " + hb_loss_host + + " due to too many missing heartbeat " "messages (recovery over disabled)\n") hb_loss_start = hb_loss_end = hb_loss_host = None @@ -383,8 +383,8 @@ class Correlator: if (re.search("Error : " + host + " (.+) Heartbeat Loss ", line)): date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") - if (date >= start_time - timedelta(minutes=1) - and date <= end_time): + if (date >= start_time - timedelta(minutes=1) and + date <= end_time): hb_loss = True break @@ -405,12 +405,12 @@ class Correlator: with open(file_path, "r") as daemon_failures: for line in daemon_failures: - if (re.search("\\d " + host - + " (.+) Failed to run the puppet manifest", + if (re.search("\\d " + host + + " (.+) Failed to run the puppet manifest", line)): date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") - if (date >= start_time - timedelta(seconds=10) - and date <= end_time): + if (date >= start_time - timedelta(seconds=10) and + date <= end_time): daemon_fail = True break @@ -433,8 +433,8 @@ class Correlator: for line in puppet_errors: if "Error: " in line: date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") - if (date >= start_time - timedelta(seconds=10) - and date <= end_time): + if (date >= start_time - timedelta(seconds=10) and + date <= end_time): puppet_log = line break @@ -460,13 +460,13 @@ class Correlator: if "force 
failed by SM" in line: host = re.findall("Error : (.+) is being", line)[0] if hostname == "all" or host == hostname: - data.append(line[0:19] + " " + host - + " force failed by SM\n") + data.append(line[0:19] + " " + host + + " force failed by SM\n") elif "Graceful Recovery Failed" in line: host = re.findall("Info : (.+) Task:", line)[0] if hostname == "all" or host == hostname: - data.append(line[0:19] + " " + host - + " graceful recovery failed\n") + data.append(line[0:19] + " " + host + + " graceful recovery failed\n") elif "MNFA ENTER" in line: mnfa_start = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") @@ -487,9 +487,9 @@ class Correlator: "%Y-%m-%dT%H:%M:%S") mnfa_duration -= mnfa_start mnfa_start = mnfa_start.strftime("%Y-%m-%dT%H:%M:%S") - data.append(mnfa_start + " Multi-node failure avoidance " - + "(duration: " + str(mnfa_duration) - + "; history:" + mnfa_hist + ")\n") + data.append(mnfa_start + " Multi-node failure avoidance " + + "(duration: " + str(mnfa_duration) + + "; history:" + mnfa_hist + ")\n") mnfa_start, mnfa_hist = None, "" @@ -506,9 +506,9 @@ class Correlator: svc_failed = re.findall( r"Service \((.+)\) is failed", line)[0] if hostname == "all" or host == hostname: - data.append(line[0:19] + " " + host - + " service failure (" + svc_failed - + ")\n") + data.append(line[0:19] + " " + host + + " service failure (" + svc_failed + + ")\n") return data @@ -524,7 +524,9 @@ class Correlator: # Open 'alarm' output file from alarm plugin and read it file_path = os.path.join(self.plugin_output_dir, "alarm") - + if not os.path.exists(file_path): + logger.debug("No alarms found") + return data with open(file_path, "r") as alarm: extract = False for line in alarm: @@ -547,8 +549,8 @@ class Correlator: temp = [] for entry in data: - temp.append(entry["name"] + " - set: " + str(entry["set"]) - + ", clear: " + str(entry["clear"]) + "\n") + temp.append(entry["name"] + " - set: " + str(entry["set"]) + + ", clear: " + str(entry["clear"]) + "\n") data = 
temp return data @@ -572,8 +574,8 @@ class Correlator: host = re.findall("Info : (.+) is ENABLED", line)[0] state = re.findall("is (.+)\n", line)[0].lower() if hostname == "all" or hostname in host: - data.append(line[0:19] + " " + host + " " + state - + "\n") + data.append(line[0:19] + " " + host + " " + + state + "\n") elif "locked-disabled" in line: host = re.findall( "Info : (.+) u?n?locked-disabled", line)[0] diff --git a/tools/collector/debian-scripts/report/execution_engine.py b/tools/collector/debian-scripts/report/execution_engine.py index 36372513..a1a45f55 100755 --- a/tools/collector/debian-scripts/report/execution_engine.py +++ b/tools/collector/debian-scripts/report/execution_engine.py @@ -1,6 +1,6 @@ ######################################################################## # -# Copyright (c) 2022 Wind River Systems, Inc. +# Copyright (c) 2022 - 2023 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -23,6 +23,7 @@ ######################################################################## import logging +import mmap import os import re import subprocess @@ -48,6 +49,10 @@ sys.dont_write_bytecode = True logger = logging.getLogger(__name__) +# regex expression used to get the hostname from the host dir name +# eg: chops '_20221201.213332' off of controller-0_20221201.213332 +regex_chop_bundle_date = r"_\d{8}\.\d{6}" + class ExecutionEngine: def __init__(self, opts, input_dir, output_dir): @@ -62,7 +67,9 @@ class ExecutionEngine: self.opts = opts self.hosts = {"controllers": {}, "workers": {}, "storages": {}} self.active_controller_directory = None + self.active_controller_hostname = None self.host_dirs = [] + self.hostnames = [] if not os.path.isdir(input_dir): logger.error("Error: Invalid input directory: %s", input_dir) @@ -89,37 +96,48 @@ class ExecutionEngine: except subprocess.CalledProcessError as e: logger.error(e) - # TODO: Need a better way to figure out the active controller - # Consider getting the system infop from all hosts. 
- # # Determine the active controller and load system info from it. for folder in (f.path for f in os.scandir(input_dir)): - logger.debug("folder: %s", os.path.basename(folder)) - # skip over the tarballs + logger.debug("base folder: %s", os.path.basename(folder)) + # skip over files (the tarballs) if not os.path.isdir(folder): continue + basename = os.path.basename(folder) + if basename == "report_analysis": + continue + + # Get the hostname from the host folder + hostname = re.sub(regex_chop_bundle_date, "", basename) + self.hostnames.append(hostname) + logger.debug("searching for active controller: %s" % hostname) host_dir = folder - extra_path = os.path.join(host_dir, "var", "extra") - database_path = os.path.join(host_dir, extra_path, "database") - host_info_path = os.path.join(host_dir, extra_path, "host.info") if os.path.isdir(host_dir): + extra_path = os.path.join(host_dir, "var", "extra") + # don't analyse a directory that doesn't contain # a 'var/extra' dir. if not os.path.exists(extra_path): + logger.warning("missing var/extra for %s" % hostname) continue + database_path = os.path.join(host_dir, extra_path, "database") + hostinfo_path = os.path.join(host_dir, extra_path, "host.info") + if os.path.exists(database_path): if os.listdir(database_path): + logger.info("Active Ctrl: %s" % hostname) self.active_controller_directory = folder + self.active_controller_hostname = hostname self.host_dirs.append(host_dir) logger.debug("Host Dirs: %s", self.host_dirs) - if os.path.exists(host_info_path): + # save host folder path based on nodetype + if os.path.exists(hostinfo_path): hostname, subfunction = self._extract_subfunction( - host_info_path) + hostinfo_path) if "controller" in subfunction: self.hosts["controllers"][hostname] = folder elif "worker" in subfunction: @@ -127,9 +145,8 @@ class ExecutionEngine: elif "storage" in subfunction: self.hosts["storages"][hostname] = folder - self.active_controller_directory = folder if not self.active_controller_directory: 
- raise ValueError("Active controller not found") + logger.error("Active Ctrl: NOT FOUND") def execute(self, plugins, output_dir): """Run a list of plugins @@ -178,6 +195,7 @@ class ExecutionEngine: os.path.join(folderpath, file) for file in plugin.state["files"] ], + plugin.state["exclude"], ) # creating output file @@ -186,37 +204,56 @@ class ExecutionEngine: f"substring_{hostname}", ) if self.opts.verbose: - logger.info("... output at " - + os.path.abspath(output_file)) - with open(output_file, "w") as file: - file.write( - f"Date range: {self.opts.start} until " - f"{self.opts.end}\n" - ) - file.write( - f"substrings: " - f"{' '.join(plugin.state['substring'])}\n" - ) - for line in events: - if line[-1] == "\n": - file.write(line) - else: - file.write(line + "\n") + logger.info("... output at " + + os.path.abspath(output_file)) + if events: + with open(output_file, "w") as file: + file.write( + f"Date range: {self.opts.start} until " + f"{self.opts.end}\n" + ) + file.write( + f"substrings: " + f"{' '.join(plugin.state['substring'])}\n" + ) + for line in events: + if line[-1] == "\n": + file.write(line) + else: + file.write(line + "\n") else: if plugin.state["algorithm"] == algorithms.SYSTEM_INFO: - for host_dir in self.host_dirs: - info = system_info(host_dir) - system_info_output = os.path.join(plugin_output_dir, - "system_info") - with open(system_info_output, "w") as file: - for i in info: - file.write(i + "\n") - for k, v in self.hosts.items(): - file.write(f"{k}: {','.join(v.keys())}\n") + # Get system info of the active controller first + # and then put the system info of each host in the + # system info output folder. 
+ system_info_output = os.path.join(plugin_output_dir, + "system_info") + if os.path.exists(system_info_output): + os.remove(system_info_output) - if self.opts.verbose: - logger.info(processing + ", output at " + - os.path.abspath(system_info_output)) + hostname = None + host_dir = None + if self.active_controller_directory is None: + hostname = re.sub(regex_chop_bundle_date, "", + os.path.basename(self.host_dirs[0])) + host_dir = self.host_dirs[0] + else: + hostname = self.active_controller_hostname + host_dir = self.active_controller_directory + + system_info(hostname, host_dir, + system_info_output, + self.hosts, True) + + for host_dir in self.host_dirs: + if host_dir != self.active_controller_directory: + hostname = re.sub(regex_chop_bundle_date, "", + os.path.basename(host_dir)) + system_info(hostname, + host_dir, + system_info_output, + None, + False) elif plugin.state["algorithm"] == algorithms.AUDIT: hosts = {} @@ -284,7 +321,6 @@ class ExecutionEngine: file.write(f"{k}:\n") for date in v["dates"]: file.write(f" {date}\n") - # creating output log file with open(log_output, "w") as file: for k, v in logs.items(): @@ -312,14 +348,16 @@ class ExecutionEngine: self._create_output_file( "maintenance_errors", plugin_output_dir, maintenance_errors(self.hosts, self.opts.start, - self.opts.end), + self.opts.end, + plugin.state["exclude"]), processing ) elif plugin.state["algorithm"] == algorithms.DAEMON_FAILURES: self._create_output_file( "daemon_failures", plugin_output_dir, daemon_failures(self.hosts, self.opts.start, - self.opts.end), + self.opts.end, + plugin.state["exclude"]), processing ) elif plugin.state["algorithm"] == algorithms.STATE_CHANGES: @@ -330,6 +368,35 @@ class ExecutionEngine: processing ) + # Dump a summary of data found by the plugins + if os.path.exists(plugin_output_dir): + + # Print a summary of the logs/data gathers by the plugins + empty_files = "" + logger.info("Plugin Results:\n") + for fn in os.listdir(plugin_output_dir): + filename = 
os.path.join(plugin_output_dir, fn) + with open(filename, "r+") as f: + # Show how much data is in each plugins output file + if os.path.isfile(filename) and os.path.getsize(filename): + buf = mmap.mmap(f.fileno(), 0) + entries = 0 + readline = buf.readline + while readline(): + entries += 1 + if fn == "system_info": + logger.info(filename) + else: + logger.info("%s has %d entries" % + (filename, entries)) + else: + empty_files += fn + " " + if empty_files: + logger.info("\n... nothing found by plugins: %s" % empty_files) + else: + logger.error("Plugin output dir missing: %s" % plugin_output_dir) + sys.exit("... exiting") + # Running the correlator and printing the output from it self.run_correlator(output_dir, plugin_output_dir) @@ -357,36 +424,46 @@ class ExecutionEngine: failures.append("\nTotal failures found: " + str(failures_len) + "\n") events.append("\nTotal events found: " + str(events_len) + "\n") alarms.append("\nTotal alarms found: " + str(alarms_len) + "\n") - state_changes.append("\nTotal state changes found: " - + str(state_changes_len) + "\n") + state_changes.append("\nTotal state changes found: " + + str(state_changes_len) + "\n") - # TODO: Put at the end of the report - logger.info("\nRunning correlator... 
view report at " - + output_dir) - self._create_output_file("correlator_failures", output_dir, + logger.info("\nCorrelated Results:\n") + self._create_output_file("failures", output_dir, failures, "") - self._create_output_file("correlator_events", output_dir, + self._create_output_file("events", output_dir, events, "") - self._create_output_file("correlator_alarms", output_dir, + self._create_output_file("alarms", output_dir, alarms, "") - self._create_output_file("correlator_state_changes", output_dir, + self._create_output_file("state_changes", output_dir, state_changes, "") + max = 0 + for sl in [events_len, alarms_len, state_changes_len, failures_len]: + if len(str(sl)) > max: + max = len(str(sl)) if not self.opts.verbose: - logger.info("Events : " + str(events_len)) - logger.info("Alarms : " + str(alarms_len)) - logger.info("State Changes: " + str(state_changes_len)) - logger.info("Failures : " + str(failures_len)) + logger.info("Events : " + str(events_len) + + " " * (max - len(str(events_len))) + + " " + output_dir + "/events") + logger.info("Alarms : " + str(alarms_len) + + " " * (max - len(str(alarms_len))) + + " " + output_dir + "/alarms") + logger.info("State Changes: " + str(state_changes_len) + + " " * (max - len(str(state_changes_len))) + + " " + output_dir + "/state_changes") + logger.info("Failures : " + str(failures_len) + + " " * (max - len(str(failures_len))) + + " " + output_dir + "/failures") for f in failures[:-1]: if "Uncontrolled swact" in f: - logger.info(f[0:19] + " " - + re.findall("active controller:? (.+)\n", - f)[0] + " uncontrolled swact") + logger.info(f[0:19] + " " + + re.findall("active controller:? 
(.+)\n", + f)[0] + " uncontrolled swact") elif "failure on" in f: host = re.findall(r"failure on ([^\s]+) ", f)[0] - logger.info(f[0:19] + " " + host + " " - + re.findall("^(.+) failure on ", - f[43:])[0].lower() + " failure") + logger.info(f[0:19] + " " + host + " " + + re.findall("^(.+) failure on ", + f[43:])[0].lower() + " failure") else: logger.info(f[:-1]) else: @@ -405,9 +482,9 @@ class ExecutionEngine: logger.info(k + ": " + str(v) + " time(s)") logger.info("\nAlarms: " + str(alarms_len)) - logger.info("The full list of alarms can be found at " - + os.path.abspath(output_dir) - + "/correlator_alarms") + logger.info("The full list of alarms can be found at " + + os.path.abspath(output_dir) + + "/alarms") # Dictionary to keep track of number of times state changes # happens on each host @@ -451,8 +528,8 @@ class ExecutionEngine: else: file.write(i + "\n") if self.opts.verbose: - output = ("... output at " - + os.path.abspath(os.path.join(directory, filename))) + output = ("... output at " + + os.path.abspath(os.path.join(directory, filename))) if processing == "": logger.info(output) else: diff --git a/tools/collector/debian-scripts/report/plugin.py b/tools/collector/debian-scripts/report/plugin.py index 3268e976..97797f49 100755 --- a/tools/collector/debian-scripts/report/plugin.py +++ b/tools/collector/debian-scripts/report/plugin.py @@ -1,6 +1,6 @@ ######################################################################## # -# Copyright (c) 2022 Wind River Systems, Inc. +# Copyright (c) 2022 - 2023 Wind River Systems, Inc. 
# # SPDX-License-Identifier: Apache-2.0 # @@ -39,6 +39,7 @@ class Plugin: "files": [], "hosts": [], "substring": [], + "exclude": [], "alarm_exclude": [], "entity_exclude": [], "start": None, @@ -78,6 +79,10 @@ class Plugin: line (string): Line from plugin file to extract """ + # allow plugins to have empty lines or comments starting with # + if len(line) <= 1 or line[0] == '#': + return + # split string from first '=', left side is label right side is value data = line.strip().split("=", 1) if len(data) <= 1: @@ -85,11 +90,17 @@ class Plugin: label = data[0] value = data[1] label = label.replace(" ", "") + + # ignore labels that don't start with an alphabetical char + if label[0].isalpha() is False: + raise ValueError("Invalid label value") try: if label == "algorithm": self.state["algorithm"] = value.replace(" ", "") elif label == "substring": self.state["substring"].append(data[1]) + elif label == "exclude": + self.state["exclude"].append(data[1]) elif label == "hosts": self.state["hosts"] = value.replace(" ", "").split(",") elif label == "alarm_exclude": diff --git a/tools/collector/debian-scripts/report/plugin_algs/audit.py b/tools/collector/debian-scripts/report/plugin_algs/audit.py index d2c93fe1..16223244 100644 --- a/tools/collector/debian-scripts/report/plugin_algs/audit.py +++ b/tools/collector/debian-scripts/report/plugin_algs/audit.py @@ -46,8 +46,8 @@ def audit(start, end, audit_log_path): # Counts sum of audits from all subclouds ] INDEX_MIDDLE_WORD = 1 - data = [("These rates and totals represent the sum of audits from " - + "all subclouds")] + data = [("These rates and totals represent the sum of audits " + + "from all subclouds")] def command(text): diff --git a/tools/collector/debian-scripts/report/plugin_algs/daemon_failures.py b/tools/collector/debian-scripts/report/plugin_algs/daemon_failures.py index dd879bb1..65979b45 100644 --- a/tools/collector/debian-scripts/report/plugin_algs/daemon_failures.py +++ 
b/tools/collector/debian-scripts/report/plugin_algs/daemon_failures.py @@ -19,7 +19,7 @@ import os from plugin_algs.substring import substring -def daemon_failures(hosts, start, end): +def daemon_failures(hosts, start, end, exclude_list=None): """Daemon failures algorithm Presents all "Failed to run the puppet manifest" log messages in the system @@ -37,6 +37,6 @@ def daemon_failures(hosts, start, end): daemon_files.append(daemon_path) daemon_substrings = ["Failed to run the puppet manifest"] - data = substring(start, end, daemon_substrings, daemon_files) + data = substring(start, end, daemon_substrings, daemon_files, exclude_list) return sorted(data) diff --git a/tools/collector/debian-scripts/report/plugin_algs/maintenance_errors.py b/tools/collector/debian-scripts/report/plugin_algs/maintenance_errors.py index c9c8b19a..41a745e8 100644 --- a/tools/collector/debian-scripts/report/plugin_algs/maintenance_errors.py +++ b/tools/collector/debian-scripts/report/plugin_algs/maintenance_errors.py @@ -19,7 +19,7 @@ import os from plugin_algs.substring import substring -def maintenance_errors(hosts, start, end): +def maintenance_errors(hosts, start, end, exclude_list=None): """Maintenance errors algorithm Presents maintenance errors and other relevant log messages in system, such as "Configuration failure" @@ -51,6 +51,6 @@ def maintenance_errors(hosts, start, end): "auto recovery disabled", "Graceful Recovery Failed", "MNFA ENTER", "MNFA EXIT", "MNFA POOL"] - data = substring(start, end, mtc_substrings, mtc_files) + data = substring(start, end, mtc_substrings, mtc_files, exclude_list) return sorted(data) diff --git a/tools/collector/debian-scripts/report/plugin_algs/substring.py b/tools/collector/debian-scripts/report/plugin_algs/substring.py index 7667c091..d7c2fb71 100644 --- a/tools/collector/debian-scripts/report/plugin_algs/substring.py +++ b/tools/collector/debian-scripts/report/plugin_algs/substring.py @@ -1,6 +1,6 @@ 
######################################################################## # -# Copyright (c) 2022 Wind River Systems, Inc. +# Copyright (c) 2022 - 2023 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -24,7 +24,7 @@ import subprocess logger = logging.getLogger(__name__) -def substring(start, end, substr, files): +def substring(start, end, substr, files, exclude_list=None): """Substring algorithm Looks for all substrings in substr within files @@ -33,7 +33,7 @@ def substring(start, end, substr, files): end (string): End time for analysis substr (string list): List of substrings to look for files (string list): List of absolute filepaths to search in - + exclude_list (string list): list of strings to exclude from report Errors: FileNotFoundError """ @@ -49,15 +49,17 @@ def substring(start, end, substr, files): if (re.search("controller-1_(.+)/var/log/mtcAgent.log", file)): continue - raise FileNotFoundError(f"File not found: {file}") + else: + data.append("File not found: " + file) + continue cont = True # Searching through file command = (f"""grep -Ea "{'|'.join(s for s in substr)}" """ f"""{file} 2>/dev/null""") status = _continue(start, end, file) - if (status == CONTINUE_CURRENT - or status == CONTINUE_CURRENT_OLD): + if (status == CONTINUE_CURRENT or + status == CONTINUE_CURRENT_OLD): # continue with current file if status == CONTINUE_CURRENT: cont = False @@ -70,8 +72,8 @@ def substring(start, end, substr, files): f"""{file}.{n} 2>/dev/null""") status = _continue(start, end, f"{file}.{n}") - if (status == CONTINUE_CURRENT - or status == CONTINUE_CURRENT_OLD): + if (status == CONTINUE_CURRENT or + status == CONTINUE_CURRENT_OLD): if status == CONTINUE_CURRENT: cont = False _evaluate_substring(start, end, data, command) @@ -85,8 +87,8 @@ def substring(start, end, substr, files): status = _continue(start, end, f"{file}.{n}.gz", compressed=True) - if (status == CONTINUE_CURRENT - or status == CONTINUE_CURRENT_OLD): + if (status == 
CONTINUE_CURRENT or + status == CONTINUE_CURRENT_OLD): if status == CONTINUE_CURRENT: cont = False _evaluate_substring(start, end, data, command) @@ -97,6 +99,17 @@ def substring(start, end, substr, files): logger.error(e) continue + # now remove any logs that contain substrings in the exclude_list + if exclude_list: + filtered_data = [] + for e in data: + found = False + for exclude in exclude_list: + if e.find(exclude) != -1: + found = True + if found is False: + filtered_data.append(e) + return sorted(filtered_data) return sorted(data) diff --git a/tools/collector/debian-scripts/report/plugin_algs/system_info.py b/tools/collector/debian-scripts/report/plugin_algs/system_info.py index ffa50773..ff1494d5 100644 --- a/tools/collector/debian-scripts/report/plugin_algs/system_info.py +++ b/tools/collector/debian-scripts/report/plugin_algs/system_info.py @@ -1,6 +1,6 @@ ######################################################################## # -# Copyright (c) 2022 Wind River Systems, Inc. +# Copyright (c) 2022 -2023 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -17,47 +17,112 @@ import os import re -def system_info(host_dir): +def system_info(hostname, host_dir, output_dir, hosts, loud=False): """System info algorithm Presents basic information about the system, such as the build type Parameters: - host_dir (string): path to the collect host dir + hostname (string): make of the host + host_dir (string): path to the collect host dir + output_dir (string): path to the file to store the system info + hosts (string): list of host objects + loud (boolean): when True print system info to stdout + + Returns: nothing """ data = [] - with open( - os.path.join(host_dir, "etc", "platform", "platform.conf") - ) as file: + + if host_dir is None: + raise ValueError("system_info:No specified host dir") + + # from /etc/platform/platform.conf + platform_conf = os.path.join(host_dir, "etc", "platform", "platform.conf") + + # ... 
load the following items first + with open(platform_conf) as file: for line in file: if "system_mode" in line: - data.append( - f"System Mode: " - f"{re.match('^system_mode=(.*)', line).group(1)}" - ) + val = re.match('^system_mode=(.*)', line).group(1) + data.append(f"System Mode: {val}") elif "system_type" in line: - data.append( - f"System Type: " - f"{re.match('^system_type=(.*)', line).group(1)}" - ) + val = re.match('^system_type=(.*)', line).group(1) + data.append(f"System Type: {val}") elif "distributed_cloud_role" in line: role = re.match('^distributed_cloud_role=(.*)', line).group(1) - data.append(f"Distributed cloud role: {role}") + data.append(f"DC Role : {role}") elif "sw_version" in line: - data.append( - f"SW Version: " - f"{re.match('^sw_version=(.*)', line).group(1)}" - ) + val = re.match('^sw_version=(.*)', line).group(1) + data.append(f"S/W Version: {val}") + # ... followed by these items + with open(platform_conf) as file: + for line in file: + if "nodetype" in line: + val = re.match('^nodetype=(.*)', line).group(1) + data.append(f"Node Type : {val}") + elif "subfunction" in line: + val = re.match('^subfunction=(.*)', line).group(1) + data.append(f"subfunction: {val}") + elif "oam_interface" in line: + val = re.match('^oam_interface=(.*)', line).group(1) + data.append(f"OAM Iface : {val}") + elif "management_interface" in line: + val = re.match('^management_interface=(.*)', line).group(1) + data.append(f"Mgmt Iface : {val}") + elif "cluster_host_interface" in line: + val = re.match('^cluster_host_interface=(.*)', line).group(1) + data.append(f"Clstr Iface: {val}") + + # /etc/os-release info + with open( + os.path.join(host_dir, "etc", "os-release") + ) as file: + for line in file: + if "PRETTY_NAME" in line: + val = (re.match('^PRETTY_NAME=(.*)', line).group(1)) + val = val.strip('\"') + data.append(f"OS Release : {val}") + + # /etc/build.info with open( os.path.join(host_dir, "etc", "build.info") ) as file: for line in file: if "BUILD_TYPE" in 
line: - data.append( - f"Build Type: " - f"{re.match('^BUILD_TYPE=(.*)', line).group(1)}" - ) - elif re.match("^OS=(.*)", line): - data.append(f"OS: {re.match('^OS=(.*)', line).group(1)}") + val = (re.match('^BUILD_TYPE=(.*)', line).group(1)) + val = val.strip('\"') + data.append(f"Build Type : {val}") + elif "BUILD_DATE" in line: + val = (re.match('^BUILD_DATE=(.*)', line).group(1)) + val = val.strip('\"') + data.append(f"Build Date : {val}") + elif "BUILD_DIR" in line: + val = (re.match('^BUILD_DIR=(.*)', line).group(1)) + val = val.strip('\"') + data.append(f"Build Dir : {val}") - return data + with open(output_dir, "a") as file: + dashs = "-" * len(hostname) + file.write("\n" + hostname + "\n" + dashs + "\n") + for i in data: + file.write(i + "\n") + if loud is True: + print(i) + + if hosts is not None: + for k, v in hosts.items(): + if not len(v.keys()): + continue + if k == "storages": + k += " " + if k == "workers": + k += " " + file.write(f"{k}: {','.join(v.keys())}\n") + if loud is True: + print(f"{k}: {','.join(v.keys())}") + + # create an empty line following the system info dump + if loud is True: + print("") + + return diff --git a/tools/collector/debian-scripts/report/plugins/maintenance_errors b/tools/collector/debian-scripts/report/plugins/maintenance_errors index 1f5bc144..92ebc9b9 100755 --- a/tools/collector/debian-scripts/report/plugins/maintenance_errors +++ b/tools/collector/debian-scripts/report/plugins/maintenance_errors @@ -1 +1,2 @@ algorithm=maintenance_errors +exclude=task clear diff --git a/tools/collector/debian-scripts/report/plugins/sm_errors b/tools/collector/debian-scripts/report/plugins/sm_errors new file mode 100755 index 00000000..fc0b6b89 --- /dev/null +++ b/tools/collector/debian-scripts/report/plugins/sm_errors @@ -0,0 +1,14 @@ +algorithm=substring +files=var/log/sm.log +hosts=controllers + +# logs to exclude +substring=ERROR: sm +exclude=Failed to set alarm +exclude=Failed to set log +exclude=Failed to clear alarm 
+exclude=Failed to get all alarms +exclude=Failed to query service based on pid +exclude=Failed to look up interface name +exclude=Failed to stop service heartbeat thread +exclude=Heartbeat is not required diff --git a/tools/collector/debian-scripts/report/plugins/substring b/tools/collector/debian-scripts/report/plugins/substring index bae11679..25c03ba3 100755 --- a/tools/collector/debian-scripts/report/plugins/substring +++ b/tools/collector/debian-scripts/report/plugins/substring @@ -1,5 +1,5 @@ algorithm=substring -files=var/log/mtcAgent.log, var/log/sm.log +files=var/log/mtcAgent.log hosts=controllers substring=operation failed substring=Failed to send message diff --git a/tools/collector/debian-scripts/report/report.py b/tools/collector/debian-scripts/report/report.py index 3411ff71..9c7bbe43 100755 --- a/tools/collector/debian-scripts/report/report.py +++ b/tools/collector/debian-scripts/report/report.py @@ -446,6 +446,7 @@ bundle_names = [] bundles = [] ignore_list = [analysis_folder_name] ignore_list += ["apps", "horizon", "lighttpd", "lost+found", "sysinv-tmpdir"] +ignore_list += ["patch-api-proxy-tmpdir", "platform-api-proxy-tmpdir"] with open(os.path.join(output_dir, "untar.log"), "a") as logfile: for obj in (os.scandir(input_dir)): @@ -463,6 +464,9 @@ with open(os.path.join(output_dir, "untar.log"), "a") as logfile: date_time = obj.name[-15:] if args.debug: logger.debug("Found Dir : %s : %s", obj.name, date_time) + elif os.path.islink(obj.path): + # ignore sym links + continue else: if not tarfile.is_tarfile(obj.path): continue @@ -559,6 +563,7 @@ elif args.debug: # create the output directory ; report_analysis output_dir = os.path.join(path_file, analysis_folder_name) +print("\nReport: %s\n" % output_dir) if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) @@ -567,7 +572,7 @@ try: engine = ExecutionEngine(args, path_file, output_dir) except ValueError as e: logger.error(str(e)) - sys.exit("Confirm you are running the report tool 
on a collect bundle") + logger.error("Confirm you are running the report tool on a collect bundle") if args.algorithm: plugins.append(Plugin(opts=vars(args))) diff --git a/tools/collector/debian/deb_folder/rules b/tools/collector/debian/deb_folder/rules index e99cc98b..a06aec28 100755 --- a/tools/collector/debian/deb_folder/rules +++ b/tools/collector/debian/deb_folder/rules @@ -30,7 +30,7 @@ override_dh_auto_install: install -m 755 -p expect_done $(ROOT)/usr/local/sbin/expect_done install -m 755 -p mariadb-cli.sh $(ROOT)/usr/local/sbin/mariadb-cli - # Report Tool + # Report Tool install -m 755 -p report/report.py $(ROOT)/usr/local/bin/report/report.py install -m 755 -p report/execution_engine.py $(ROOT)/usr/local/bin/report/execution_engine.py install -m 755 -p report/algorithms.py $(ROOT)/usr/local/bin/report/algorithms.py @@ -38,7 +38,7 @@ override_dh_auto_install: install -m 755 -p report/correlator.py $(ROOT)/usr/local/bin/report/correlator.py install -m 755 -p report/README $(ROOT)/usr/local/bin/report/README - # Report Tool Plugin Algorithms + # Report Tool Plugin Algorithms install -m 755 -p report/plugin_algs/alarm.py $(ROOT)/usr/local/bin/report/plugin_algs/alarm.py install -m 755 -p report/plugin_algs/audit.py $(ROOT)/usr/local/bin/report/plugin_algs/audit.py install -m 755 -p report/plugin_algs/daemon_failures.py $(ROOT)/usr/local/bin/report/plugin_algs/daemon_failures.py @@ -51,20 +51,21 @@ override_dh_auto_install: install -m 755 -p report/plugin_algs/swact_activity.py $(ROOT)/usr/local/bin/report/plugin_algs/swact_activity.py install -m 755 -p report/plugin_algs/system_info.py $(ROOT)/usr/local/bin/report/plugin_algs/system_info.py - # Report Tool Plugins + # Report Tool Plugins install -m 755 -p report/plugins/alarm $(ROOT)/usr/local/bin/report/plugins/alarm install -m 755 -p report/plugins/daemon_failures $(ROOT)/usr/local/bin/report/plugins/daemon_failures install -m 755 -p report/plugins/heartbeat_loss 
$(ROOT)/usr/local/bin/report/plugins/heartbeat_loss install -m 755 -p report/plugins/maintenance_errors $(ROOT)/usr/local/bin/report/plugins/maintenance_errors install -m 755 -p report/plugins/process_failures $(ROOT)/usr/local/bin/report/plugins/process_failures install -m 755 -p report/plugins/puppet_errors $(ROOT)/usr/local/bin/report/plugins/puppet_errors + install -m 755 -p report/plugins/sm_errors $(ROOT)/usr/local/bin/report/plugins/sm_errors install -m 755 -p report/plugins/state_changes $(ROOT)/usr/local/bin/report/plugins/state_changes install -m 755 -p report/plugins/substring $(ROOT)/usr/local/bin/report/plugins/substring install -m 755 -p report/plugins/swact_activity $(ROOT)/usr/local/bin/report/plugins/swact_activity install -m 755 -p report/plugins/system_info $(ROOT)/usr/local/bin/report/plugins/system_info install -m 755 -p report/plugins/substring_hosts $(SYSCONFDIR)/collect/plugins/substring_hosts - # Collect Plugins + # Collect Plugins install -m 755 -p collect_sysinv.sh $(SYSCONFDIR)/collect.d/collect_sysinv install -m 755 -p collect_psqldb.sh $(SYSCONFDIR)/collect.d/collect_psqldb install -m 755 -p collect_mariadb.sh $(SYSCONFDIR)/collect.d/collect_mariadb