diff --git a/tools/collector/debian-scripts/report/correlator.py b/tools/collector/debian-scripts/report/correlator.py index c6b94344..1b0b7c14 100755 --- a/tools/collector/debian-scripts/report/correlator.py +++ b/tools/collector/debian-scripts/report/correlator.py @@ -1,6 +1,6 @@ ######################################################################## # -# Copyright (c) 2022 Wind River Systems, Inc. +# Copyright (c) 2022 - 2023 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -109,8 +109,8 @@ class Correlator: ctrlr_link_down = re.findall( r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) " "sm:", line)[0] - elif (re.search("Neighbor (.+) is now in the down", line) - and start_time and not ctrlr_down): + elif (re.search("Neighbor (.+) is now in the down", line) and + start_time and not ctrlr_down): ctrlr_down = re.findall( r"Neighbor \((.+)\) received event", line)[0] elif (re.search("Service (.+) is failed and has reached max " @@ -121,8 +121,8 @@ class Correlator: r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:", line)[0] elif (svc_failed and re.search( - "active-failed\\s+\\| disabling-failed\\s+\\| " - + svc_failed, line)): + "active-failed\\s+\\| disabling-failed\\s+\\| " + + svc_failed, line)): if re.search(r"\| go-active-failed\s+\|", line): go_active_failed = True else: @@ -140,40 +140,40 @@ class Correlator: start_time = start_time.strftime("%Y-%m-%dT%H:%M:%S") end_time = end_time.strftime("%Y-%m-%dT%H:%M:%S") if link_down: - data.append(start_time + " to " + end_time - + " Uncontrolled swact, refer to SM logs " + data.append(start_time + " to " + end_time + + " Uncontrolled swact, refer to SM logs " "for in-depth analysis, original active " "controller: " + ctrlr_link_down + "\n") elif ctrlr_down: if hb_loss: - data.append(start_time + " to " + end_time - + " Uncontrolled swact due to " + data.append(start_time + " to " + end_time + + " Uncontrolled swact due to " "spontaneous reset of active " "controller " + ctrlr_down + 
"\n") else: - data.append(start_time + " to " + end_time - + " Uncontrolled swact likely due to " + data.append(start_time + " to " + end_time + + " Uncontrolled swact likely due to " "spontaneous reset of active " "controller " + ctrlr_down + "\n") elif svc_failed: if active_failed and go_active_failed: - data.append(start_time + " to " + end_time - + " Uncontrolled swact due to service " + data.append(start_time + " to " + end_time + + " Uncontrolled swact due to service " "failure (" + svc_failed + ") twice " "in 2 minutes was unsuccessful so " "\"bounced back\" to original active " "controller " + ctrlr_svc_fail + "\n") elif active_failed: - data.append(start_time + " to " + end_time - + " Uncontrolled swact due to service " + data.append(start_time + " to " + end_time + + " Uncontrolled swact due to service " "failure (" + svc_failed + ") twice " - "in 2 minutes on active controller " - + ctrlr_svc_fail + "\n") + "in 2 minutes on active controller " + + ctrlr_svc_fail + "\n") else: - data.append(start_time + " to " + end_time - + " Uncontrolled swact likely due to " - "service failure (" + svc_failed - + ") twice in 2 minutes on active " + data.append(start_time + " to " + end_time + + " Uncontrolled swact likely due to " + "service failure (" + svc_failed + + ") twice in 2 minutes on active " "controller " + ctrlr_svc_fail + "\n") start_time = end_time = svc_failed = None @@ -218,19 +218,19 @@ class Correlator: host[0] + " auto recovery disabled", line)): old = data[-1].split("due", 1) if len(old) == 1: - data[-1] = (data[-1][:-1] - + " (auto recovery disabled)\n") + data[-1] = (data[-1][:-1] + + " (auto recovery disabled)\n") else: - data[-1] = (old[0] - + "(auto recovery disabled) due" - + old[1]) + data[-1] = (old[0] + + "(auto recovery disabled) due" + + old[1]) auto_recov_dis = True elif "GOENABLED Failed" in line and not goenable_start: goenable_start, auto_recov_dis = line[0:19], False goenable_host = re.findall( "Error : (.+) got GOENABLED Failed", 
line)[0] - elif ("configuration failed or incomplete" in line - and not config_start): + elif ("configuration failed or incomplete" in line and not + config_start): config_start = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") auto_recov_dis = False @@ -248,8 +248,8 @@ class Correlator: if (re.search(host + " (.+) Heartbeat Loss (.+) " "\\(during recovery soak\\)", line)): old = data[-1] - data[-1] = (old[0:23] + line[0:19] + old[42:-1] - + " (recovery over disabled due to " + data[-1] = (old[0:23] + line[0:19] + old[42:-1] + + " (recovery over disabled due to " "heartbeat soak failure)\n") else: hb_loss_start = line[0:19] @@ -257,15 +257,15 @@ class Correlator: hb_loss_host = re.findall("Error : (.+) [CM]", line)[0] # Check if previous failure recorded was heartbeat loss due to # missing heartbeat messages - elif ("regained MTCALIVE from host that has rebooted" in line - and data and re.search(r"Heartbeat loss failure (.+) " - r"\(recovery over disabled\)", - data[-1])): + elif ("regained MTCALIVE from host that rebooted" in line and + data and re.search( + r"Heartbeat loss failure (.+) " + r"\(recovery over disabled\)", data[-1])): host = re.findall("failure on (.+) due to", data[-1])[0] if re.search(host + " regained MTCALIVE", line): old = data[-1].split("due", 1)[0] - data[-1] = (old[0:23] + line[0:19] + old[42:] - + "due to uncontrolled reboot\n") + data[-1] = (old[0:23] + line[0:19] + old[42:] + + "due to uncontrolled reboot\n") elif (hb_loss_start and not comm_loss and hb_loss_host and re.search(hb_loss_host + " Loss Of Communication for 5 " "seconds", line)): @@ -282,14 +282,14 @@ class Correlator: "threshold reached", line)): goenable_end = line[0:19] if goenable_tst_f: - data.append(goenable_start + " to " + goenable_end - + " Go-enable test failure on " - + goenable_host + " due to failing of " - + goenable_tst_f + "\n") + data.append(goenable_start + " to " + goenable_end + + " Go-enable test failure on " + + goenable_host + " due to failing of " + 
+ goenable_tst_f + "\n") else: - data.append(goenable_start + " to " + goenable_end - + " Go-enable test failure on " - + goenable_host + " due to unknown test " + data.append(goenable_start + " to " + goenable_end + + " Go-enable test failure on " + + goenable_host + " due to unknown test " "failing\n") goenable_start = goenable_end = goenable_host = None @@ -299,8 +299,8 @@ class Correlator: "threshold reached", line)): config_end = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") - if (config_tst_f - != "/etc/goenabled.d/config_goenabled_check.sh"): + if (config_tst_f != + "/etc/goenabled.d/config_goenabled_check.sh"): try: daemon_fail = self.search_daemon_fail( config_start, config_end, config_host) @@ -308,8 +308,8 @@ class Correlator: logger.error(e) if (config_tst_f == - "/etc/goenabled.d/config_goenabled_check.sh" - or daemon_fail): + "/etc/goenabled.d/config_goenabled_check.sh" or + daemon_fail): try: puppet_error = self.search_puppet_error( config_start, config_end) @@ -320,22 +320,22 @@ class Correlator: "%Y-%m-%dT%H:%M:%S") config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S") if puppet_error: - data.append(config_start + " to " + config_end - + " Configuration failure on " - + config_host + " due to:\n" - + puppet_error) + data.append(config_start + " to " + config_end + + " Configuration failure on " + + config_host + " due to:\n" + + puppet_error) else: - data.append(config_start + " to " + config_end - + " Configuration failure on " - + config_host - + " due to unknown cause\n") + data.append(config_start + " to " + config_end + + " Configuration failure on " + + config_host + + " due to unknown cause\n") else: config_start = config_start.strftime( "%Y-%m-%dT%H:%M:%S") config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S") - data.append(config_start + " to " + config_end - + " Possible configuration failure on " - + config_host + "\n") + data.append(config_start + " to " + config_end + + " Possible configuration failure on " + + config_host + "\n") 
config_start = config_end = config_host = None config_tst_f = puppet_error = None @@ -344,9 +344,9 @@ class Correlator: re.search(hb_loss_host + " Connectivity Recovered ", line)): hb_loss_end = line[0:19] - data.append(hb_loss_start + " to " + hb_loss_end - + " Heartbeat loss failure on " + hb_loss_host - + " due to too many missing heartbeat " + data.append(hb_loss_start + " to " + hb_loss_end + + " Heartbeat loss failure on " + hb_loss_host + + " due to too many missing heartbeat " "messages\n") hb_loss_start = hb_loss_end = hb_loss_host = None @@ -355,9 +355,9 @@ class Correlator: hb_loss_host and re.search( hb_loss_host + " Graceful Recovery Wait", line)): hb_loss_end = line[0:19] - data.append(hb_loss_start + " to " + hb_loss_end - + " Heartbeat loss failure on " + hb_loss_host - + " due to too many missing heartbeat " + data.append(hb_loss_start + " to " + hb_loss_end + + " Heartbeat loss failure on " + hb_loss_host + + " due to too many missing heartbeat " "messages (recovery over disabled)\n") hb_loss_start = hb_loss_end = hb_loss_host = None @@ -383,8 +383,8 @@ class Correlator: if (re.search("Error : " + host + " (.+) Heartbeat Loss ", line)): date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") - if (date >= start_time - timedelta(minutes=1) - and date <= end_time): + if (date >= start_time - timedelta(minutes=1) and + date <= end_time): hb_loss = True break @@ -405,12 +405,12 @@ class Correlator: with open(file_path, "r") as daemon_failures: for line in daemon_failures: - if (re.search("\\d " + host - + " (.+) Failed to run the puppet manifest", + if (re.search("\\d " + host + + " (.+) Failed to run the puppet manifest", line)): date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") - if (date >= start_time - timedelta(seconds=10) - and date <= end_time): + if (date >= start_time - timedelta(seconds=10) and + date <= end_time): daemon_fail = True break @@ -433,8 +433,8 @@ class Correlator: for line in puppet_errors: if "Error: " in line: date = 
datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") - if (date >= start_time - timedelta(seconds=10) - and date <= end_time): + if (date >= start_time - timedelta(seconds=10) and + date <= end_time): puppet_log = line break @@ -460,13 +460,13 @@ class Correlator: if "force failed by SM" in line: host = re.findall("Error : (.+) is being", line)[0] if hostname == "all" or host == hostname: - data.append(line[0:19] + " " + host - + " force failed by SM\n") + data.append(line[0:19] + " " + host + + " force failed by SM\n") elif "Graceful Recovery Failed" in line: host = re.findall("Info : (.+) Task:", line)[0] if hostname == "all" or host == hostname: - data.append(line[0:19] + " " + host - + " graceful recovery failed\n") + data.append(line[0:19] + " " + host + + " graceful recovery failed\n") elif "MNFA ENTER" in line: mnfa_start = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") @@ -487,9 +487,9 @@ class Correlator: "%Y-%m-%dT%H:%M:%S") mnfa_duration -= mnfa_start mnfa_start = mnfa_start.strftime("%Y-%m-%dT%H:%M:%S") - data.append(mnfa_start + " Multi-node failure avoidance " - + "(duration: " + str(mnfa_duration) - + "; history:" + mnfa_hist + ")\n") + data.append(mnfa_start + " Multi-node failure avoidance " + + "(duration: " + str(mnfa_duration) + + "; history:" + mnfa_hist + ")\n") mnfa_start, mnfa_hist = None, "" @@ -506,9 +506,9 @@ class Correlator: svc_failed = re.findall( r"Service \((.+)\) is failed", line)[0] if hostname == "all" or host == hostname: - data.append(line[0:19] + " " + host - + " service failure (" + svc_failed - + ")\n") + data.append(line[0:19] + " " + host + + " service failure (" + svc_failed + + ")\n") return data @@ -524,7 +524,9 @@ class Correlator: # Open 'alarm' output file from alarm plugin and read it file_path = os.path.join(self.plugin_output_dir, "alarm") - + if not os.path.exists(file_path): + logger.debug("No alarms found") + return data with open(file_path, "r") as alarm: extract = False for line in alarm: @@ -547,8 +549,8 
@@ class Correlator: temp = [] for entry in data: - temp.append(entry["name"] + " - set: " + str(entry["set"]) - + ", clear: " + str(entry["clear"]) + "\n") + temp.append(entry["name"] + " - set: " + str(entry["set"]) + + ", clear: " + str(entry["clear"]) + "\n") data = temp return data @@ -572,8 +574,8 @@ class Correlator: host = re.findall("Info : (.+) is ENABLED", line)[0] state = re.findall("is (.+)\n", line)[0].lower() if hostname == "all" or hostname in host: - data.append(line[0:19] + " " + host + " " + state - + "\n") + data.append(line[0:19] + " " + host + " " + + state + "\n") elif "locked-disabled" in line: host = re.findall( "Info : (.+) u?n?locked-disabled", line)[0] diff --git a/tools/collector/debian-scripts/report/execution_engine.py b/tools/collector/debian-scripts/report/execution_engine.py index 36372513..a1a45f55 100755 --- a/tools/collector/debian-scripts/report/execution_engine.py +++ b/tools/collector/debian-scripts/report/execution_engine.py @@ -1,6 +1,6 @@ ######################################################################## # -# Copyright (c) 2022 Wind River Systems, Inc. +# Copyright (c) 2022 - 2023 Wind River Systems, Inc. 
# # SPDX-License-Identifier: Apache-2.0 # @@ -23,6 +23,7 @@ ######################################################################## import logging +import mmap import os import re import subprocess @@ -48,6 +49,10 @@ sys.dont_write_bytecode = True logger = logging.getLogger(__name__) +# regex expression used to get the hostname from the host dir name +# eg: chops '_20221201.213332' off of controller-0_20221201.213332 +regex_chop_bundle_date = r"_\d{8}\.\d{6}" + class ExecutionEngine: def __init__(self, opts, input_dir, output_dir): @@ -62,7 +67,9 @@ class ExecutionEngine: self.opts = opts self.hosts = {"controllers": {}, "workers": {}, "storages": {}} self.active_controller_directory = None + self.active_controller_hostname = None self.host_dirs = [] + self.hostnames = [] if not os.path.isdir(input_dir): logger.error("Error: Invalid input directory: %s", input_dir) @@ -89,37 +96,48 @@ class ExecutionEngine: except subprocess.CalledProcessError as e: logger.error(e) - # TODO: Need a better way to figure out the active controller - # Consider getting the system infop from all hosts. - # # Determine the active controller and load system info from it. 
for folder in (f.path for f in os.scandir(input_dir)): - logger.debug("folder: %s", os.path.basename(folder)) - # skip over the tarballs + logger.debug("base folder: %s", os.path.basename(folder)) + # skip over files (the tarballs) if not os.path.isdir(folder): continue + basename = os.path.basename(folder) + if basename == "report_analysis": + continue + + # Get the hostname from the host folder + hostname = re.sub(regex_chop_bundle_date, "", basename) + self.hostnames.append(hostname) + logger.debug("searching for active controller: %s" % hostname) host_dir = folder - extra_path = os.path.join(host_dir, "var", "extra") - database_path = os.path.join(host_dir, extra_path, "database") - host_info_path = os.path.join(host_dir, extra_path, "host.info") if os.path.isdir(host_dir): + extra_path = os.path.join(host_dir, "var", "extra") + # don't analyse a directory that doesn't contain # a 'var/extra' dir. if not os.path.exists(extra_path): + logger.warning("missing var/extra for %s" % hostname) continue + database_path = os.path.join(host_dir, extra_path, "database") + hostinfo_path = os.path.join(host_dir, extra_path, "host.info") + if os.path.exists(database_path): if os.listdir(database_path): + logger.info("Active Ctrl: %s" % hostname) self.active_controller_directory = folder + self.active_controller_hostname = hostname self.host_dirs.append(host_dir) logger.debug("Host Dirs: %s", self.host_dirs) - if os.path.exists(host_info_path): + # save host folder path based on nodetype + if os.path.exists(hostinfo_path): hostname, subfunction = self._extract_subfunction( - host_info_path) + hostinfo_path) if "controller" in subfunction: self.hosts["controllers"][hostname] = folder elif "worker" in subfunction: @@ -127,9 +145,8 @@ class ExecutionEngine: elif "storage" in subfunction: self.hosts["storages"][hostname] = folder - self.active_controller_directory = folder if not self.active_controller_directory: - raise ValueError("Active controller not found") + 
logger.error("Active Ctrl: NOT FOUND") def execute(self, plugins, output_dir): """Run a list of plugins @@ -178,6 +195,7 @@ class ExecutionEngine: os.path.join(folderpath, file) for file in plugin.state["files"] ], + plugin.state["exclude"], ) # creating output file @@ -186,37 +204,56 @@ class ExecutionEngine: f"substring_{hostname}", ) if self.opts.verbose: - logger.info("... output at " - + os.path.abspath(output_file)) - with open(output_file, "w") as file: - file.write( - f"Date range: {self.opts.start} until " - f"{self.opts.end}\n" - ) - file.write( - f"substrings: " - f"{' '.join(plugin.state['substring'])}\n" - ) - for line in events: - if line[-1] == "\n": - file.write(line) - else: - file.write(line + "\n") + logger.info("... output at " + + os.path.abspath(output_file)) + if events: + with open(output_file, "w") as file: + file.write( + f"Date range: {self.opts.start} until " + f"{self.opts.end}\n" + ) + file.write( + f"substrings: " + f"{' '.join(plugin.state['substring'])}\n" + ) + for line in events: + if line[-1] == "\n": + file.write(line) + else: + file.write(line + "\n") else: if plugin.state["algorithm"] == algorithms.SYSTEM_INFO: - for host_dir in self.host_dirs: - info = system_info(host_dir) - system_info_output = os.path.join(plugin_output_dir, - "system_info") - with open(system_info_output, "w") as file: - for i in info: - file.write(i + "\n") - for k, v in self.hosts.items(): - file.write(f"{k}: {','.join(v.keys())}\n") + # Get system info of the active controller first + # and then put the system info of each host in the + # system info output folder. 
+ system_info_output = os.path.join(plugin_output_dir, + "system_info") + if os.path.exists(system_info_output): + os.remove(system_info_output) - if self.opts.verbose: - logger.info(processing + ", output at " + - os.path.abspath(system_info_output)) + hostname = None + host_dir = None + if self.active_controller_directory is None: + hostname = re.sub(regex_chop_bundle_date, "", + os.path.basename(self.host_dirs[0])) + host_dir = self.host_dirs[0] + else: + hostname = self.active_controller_hostname + host_dir = self.active_controller_directory + + system_info(hostname, host_dir, + system_info_output, + self.hosts, True) + + for host_dir in self.host_dirs: + if host_dir != self.active_controller_directory: + hostname = re.sub(regex_chop_bundle_date, "", + os.path.basename(host_dir)) + system_info(hostname, + host_dir, + system_info_output, + None, + False) elif plugin.state["algorithm"] == algorithms.AUDIT: hosts = {} @@ -284,7 +321,6 @@ class ExecutionEngine: file.write(f"{k}:\n") for date in v["dates"]: file.write(f" {date}\n") - # creating output log file with open(log_output, "w") as file: for k, v in logs.items(): @@ -312,14 +348,16 @@ class ExecutionEngine: self._create_output_file( "maintenance_errors", plugin_output_dir, maintenance_errors(self.hosts, self.opts.start, - self.opts.end), + self.opts.end, + plugin.state["exclude"]), processing ) elif plugin.state["algorithm"] == algorithms.DAEMON_FAILURES: self._create_output_file( "daemon_failures", plugin_output_dir, daemon_failures(self.hosts, self.opts.start, - self.opts.end), + self.opts.end, + plugin.state["exclude"]), processing ) elif plugin.state["algorithm"] == algorithms.STATE_CHANGES: @@ -330,6 +368,35 @@ class ExecutionEngine: processing ) + # Dump a summary of data found by the plugins + if os.path.exists(plugin_output_dir): + + # Print a summary of the logs/data gathers by the plugins + empty_files = "" + logger.info("Plugin Results:\n") + for fn in os.listdir(plugin_output_dir): + filename = 
os.path.join(plugin_output_dir, fn) + with open(filename, "r+") as f: + # Show how much data is in each plugins output file + if os.path.isfile(filename) and os.path.getsize(filename): + buf = mmap.mmap(f.fileno(), 0) + entries = 0 + readline = buf.readline + while readline(): + entries += 1 + if fn == "system_info": + logger.info(filename) + else: + logger.info("%s has %d entries" % + (filename, entries)) + else: + empty_files += fn + " " + if empty_files: + logger.info("\n... nothing found by plugins: %s" % empty_files) + else: + logger.error("Plugin output dir missing: %s" % plugin_output_dir) + sys.exit("... exiting") + # Running the correlator and printing the output from it self.run_correlator(output_dir, plugin_output_dir) @@ -357,36 +424,46 @@ class ExecutionEngine: failures.append("\nTotal failures found: " + str(failures_len) + "\n") events.append("\nTotal events found: " + str(events_len) + "\n") alarms.append("\nTotal alarms found: " + str(alarms_len) + "\n") - state_changes.append("\nTotal state changes found: " - + str(state_changes_len) + "\n") + state_changes.append("\nTotal state changes found: " + + str(state_changes_len) + "\n") - # TODO: Put at the end of the report - logger.info("\nRunning correlator... 
view report at " - + output_dir) - self._create_output_file("correlator_failures", output_dir, + logger.info("\nCorrelated Results:\n") + self._create_output_file("failures", output_dir, failures, "") - self._create_output_file("correlator_events", output_dir, + self._create_output_file("events", output_dir, events, "") - self._create_output_file("correlator_alarms", output_dir, + self._create_output_file("alarms", output_dir, alarms, "") - self._create_output_file("correlator_state_changes", output_dir, + self._create_output_file("state_changes", output_dir, state_changes, "") + max = 0 + for sl in [events_len, alarms_len, state_changes_len, failures_len]: + if len(str(sl)) > max: + max = len(str(sl)) if not self.opts.verbose: - logger.info("Events : " + str(events_len)) - logger.info("Alarms : " + str(alarms_len)) - logger.info("State Changes: " + str(state_changes_len)) - logger.info("Failures : " + str(failures_len)) + logger.info("Events : " + str(events_len) + + " " * (max - len(str(events_len))) + + " " + output_dir + "/events") + logger.info("Alarms : " + str(alarms_len) + + " " * (max - len(str(alarms_len))) + + " " + output_dir + "/alarms") + logger.info("State Changes: " + str(state_changes_len) + + " " * (max - len(str(state_changes_len))) + + " " + output_dir + "/state_changes") + logger.info("Failures : " + str(failures_len) + + " " * (max - len(str(failures_len))) + + " " + output_dir + "/failures") for f in failures[:-1]: if "Uncontrolled swact" in f: - logger.info(f[0:19] + " " - + re.findall("active controller:? (.+)\n", - f)[0] + " uncontrolled swact") + logger.info(f[0:19] + " " + + re.findall("active controller:? 
(.+)\n", + f)[0] + " uncontrolled swact") elif "failure on" in f: host = re.findall(r"failure on ([^\s]+) ", f)[0] - logger.info(f[0:19] + " " + host + " " - + re.findall("^(.+) failure on ", - f[43:])[0].lower() + " failure") + logger.info(f[0:19] + " " + host + " " + + re.findall("^(.+) failure on ", + f[43:])[0].lower() + " failure") else: logger.info(f[:-1]) else: @@ -405,9 +482,9 @@ class ExecutionEngine: logger.info(k + ": " + str(v) + " time(s)") logger.info("\nAlarms: " + str(alarms_len)) - logger.info("The full list of alarms can be found at " - + os.path.abspath(output_dir) - + "/correlator_alarms") + logger.info("The full list of alarms can be found at " + + os.path.abspath(output_dir) + + "/alarms") # Dictionary to keep track of number of times state changes # happens on each host @@ -451,8 +528,8 @@ class ExecutionEngine: else: file.write(i + "\n") if self.opts.verbose: - output = ("... output at " - + os.path.abspath(os.path.join(directory, filename))) + output = ("... output at " + + os.path.abspath(os.path.join(directory, filename))) if processing == "": logger.info(output) else: diff --git a/tools/collector/debian-scripts/report/plugin.py b/tools/collector/debian-scripts/report/plugin.py index 3268e976..97797f49 100755 --- a/tools/collector/debian-scripts/report/plugin.py +++ b/tools/collector/debian-scripts/report/plugin.py @@ -1,6 +1,6 @@ ######################################################################## # -# Copyright (c) 2022 Wind River Systems, Inc. +# Copyright (c) 2022 - 2023 Wind River Systems, Inc. 
# # SPDX-License-Identifier: Apache-2.0 # @@ -39,6 +39,7 @@ class Plugin: "files": [], "hosts": [], "substring": [], + "exclude": [], "alarm_exclude": [], "entity_exclude": [], "start": None, @@ -78,6 +79,10 @@ class Plugin: line (string): Line from plugin file to extract """ + # allow plugins to have empty lines or comments starting with # + if len(line) <= 1 or line[0] == '#': + return + # split string from first '=', left side is label right side is value data = line.strip().split("=", 1) if len(data) <= 1: @@ -85,11 +90,17 @@ class Plugin: label = data[0] value = data[1] label = label.replace(" ", "") + + # ignore labels that don't start with an alphabetical char + if label[0].isalpha() is False: + raise ValueError("Invalid label value") try: if label == "algorithm": self.state["algorithm"] = value.replace(" ", "") elif label == "substring": self.state["substring"].append(data[1]) + elif label == "exclude": + self.state["exclude"].append(data[1]) elif label == "hosts": self.state["hosts"] = value.replace(" ", "").split(",") elif label == "alarm_exclude": diff --git a/tools/collector/debian-scripts/report/plugin_algs/audit.py b/tools/collector/debian-scripts/report/plugin_algs/audit.py index d2c93fe1..16223244 100644 --- a/tools/collector/debian-scripts/report/plugin_algs/audit.py +++ b/tools/collector/debian-scripts/report/plugin_algs/audit.py @@ -46,8 +46,8 @@ def audit(start, end, audit_log_path): # Counts sum of audits from all subclouds ] INDEX_MIDDLE_WORD = 1 - data = [("These rates and totals represent the sum of audits from " - + "all subclouds")] + data = [("These rates and totals represent the sum of audits " + + "from all subclouds")] def command(text): diff --git a/tools/collector/debian-scripts/report/plugin_algs/daemon_failures.py b/tools/collector/debian-scripts/report/plugin_algs/daemon_failures.py index dd879bb1..65979b45 100644 --- a/tools/collector/debian-scripts/report/plugin_algs/daemon_failures.py +++ 
b/tools/collector/debian-scripts/report/plugin_algs/daemon_failures.py @@ -19,7 +19,7 @@ import os from plugin_algs.substring import substring -def daemon_failures(hosts, start, end): +def daemon_failures(hosts, start, end, exclude_list=None): """Daemon failures algorithm Presents all "Failed to run the puppet manifest" log messages in the system @@ -37,6 +37,6 @@ def daemon_failures(hosts, start, end): daemon_files.append(daemon_path) daemon_substrings = ["Failed to run the puppet manifest"] - data = substring(start, end, daemon_substrings, daemon_files) + data = substring(start, end, daemon_substrings, daemon_files, exclude_list) return sorted(data) diff --git a/tools/collector/debian-scripts/report/plugin_algs/maintenance_errors.py b/tools/collector/debian-scripts/report/plugin_algs/maintenance_errors.py index c9c8b19a..41a745e8 100644 --- a/tools/collector/debian-scripts/report/plugin_algs/maintenance_errors.py +++ b/tools/collector/debian-scripts/report/plugin_algs/maintenance_errors.py @@ -19,7 +19,7 @@ import os from plugin_algs.substring import substring -def maintenance_errors(hosts, start, end): +def maintenance_errors(hosts, start, end, exclude_list=None): """Maintenance errors algorithm Presents maintenance errors and other relevant log messages in system, such as "Configuration failure" @@ -51,6 +51,6 @@ def maintenance_errors(hosts, start, end): "auto recovery disabled", "Graceful Recovery Failed", "MNFA ENTER", "MNFA EXIT", "MNFA POOL"] - data = substring(start, end, mtc_substrings, mtc_files) + data = substring(start, end, mtc_substrings, mtc_files, exclude_list) return sorted(data) diff --git a/tools/collector/debian-scripts/report/plugin_algs/substring.py b/tools/collector/debian-scripts/report/plugin_algs/substring.py index 7667c091..d7c2fb71 100644 --- a/tools/collector/debian-scripts/report/plugin_algs/substring.py +++ b/tools/collector/debian-scripts/report/plugin_algs/substring.py @@ -1,6 +1,6 @@ 
######################################################################## # -# Copyright (c) 2022 Wind River Systems, Inc. +# Copyright (c) 2022 - 2023 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -24,7 +24,7 @@ import subprocess logger = logging.getLogger(__name__) -def substring(start, end, substr, files): +def substring(start, end, substr, files, exclude_list=None): """Substring algorithm Looks for all substrings in substr within files @@ -33,7 +33,7 @@ def substring(start, end, substr, files): end (string): End time for analysis substr (string list): List of substrings to look for files (string list): List of absolute filepaths to search in - + exclude_list (string list): list of strings to exclude from report Errors: FileNotFoundError """ @@ -49,15 +49,17 @@ def substring(start, end, substr, files): if (re.search("controller-1_(.+)/var/log/mtcAgent.log", file)): continue - raise FileNotFoundError(f"File not found: {file}") + else: + data.append("File not found: " + file) + continue cont = True # Searching through file command = (f"""grep -Ea "{'|'.join(s for s in substr)}" """ f"""{file} 2>/dev/null""") status = _continue(start, end, file) - if (status == CONTINUE_CURRENT - or status == CONTINUE_CURRENT_OLD): + if (status == CONTINUE_CURRENT or + status == CONTINUE_CURRENT_OLD): # continue with current file if status == CONTINUE_CURRENT: cont = False @@ -70,8 +72,8 @@ def substring(start, end, substr, files): f"""{file}.{n} 2>/dev/null""") status = _continue(start, end, f"{file}.{n}") - if (status == CONTINUE_CURRENT - or status == CONTINUE_CURRENT_OLD): + if (status == CONTINUE_CURRENT or + status == CONTINUE_CURRENT_OLD): if status == CONTINUE_CURRENT: cont = False _evaluate_substring(start, end, data, command) @@ -85,8 +87,8 @@ def substring(start, end, substr, files): status = _continue(start, end, f"{file}.{n}.gz", compressed=True) - if (status == CONTINUE_CURRENT - or status == CONTINUE_CURRENT_OLD): + if (status == 
CONTINUE_CURRENT or + status == CONTINUE_CURRENT_OLD): if status == CONTINUE_CURRENT: cont = False _evaluate_substring(start, end, data, command) @@ -97,6 +99,17 @@ def substring(start, end, substr, files): logger.error(e) continue + # now remove any logs that contain substrings in the exclude_list + if exclude_list: + filtered_data = [] + for e in data: + found = False + for exclude in exclude_list: + if e.find(exclude) != -1: + found = True + if found is False: + filtered_data.append(e) + return sorted(filtered_data) return sorted(data) diff --git a/tools/collector/debian-scripts/report/plugin_algs/system_info.py b/tools/collector/debian-scripts/report/plugin_algs/system_info.py index ffa50773..ff1494d5 100644 --- a/tools/collector/debian-scripts/report/plugin_algs/system_info.py +++ b/tools/collector/debian-scripts/report/plugin_algs/system_info.py @@ -1,6 +1,6 @@ ######################################################################## # -# Copyright (c) 2022 Wind River Systems, Inc. +# Copyright (c) 2022 - 2023 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -17,47 +17,112 @@ import os import re -def system_info(host_dir): +def system_info(hostname, host_dir, output_dir, hosts, loud=False): """System info algorithm Presents basic information about the system, such as the build type Parameters: - host_dir (string): path to the collect host dir + hostname (string): name of the host + host_dir (string): path to the collect host dir + output_dir (string): path to the file to store the system info + hosts (string): list of host objects + loud (boolean): when True print system info to stdout + + Returns: nothing """ data = [] - with open( - os.path.join(host_dir, "etc", "platform", "platform.conf") - ) as file: + + if host_dir is None: + raise ValueError("system_info:No specified host dir") + + # from /etc/platform/platform.conf + platform_conf = os.path.join(host_dir, "etc", "platform", "platform.conf") + + # ... 
load the following items first + with open(platform_conf) as file: for line in file: if "system_mode" in line: - data.append( - f"System Mode: " - f"{re.match('^system_mode=(.*)', line).group(1)}" - ) + val = re.match('^system_mode=(.*)', line).group(1) + data.append(f"System Mode: {val}") elif "system_type" in line: - data.append( - f"System Type: " - f"{re.match('^system_type=(.*)', line).group(1)}" - ) + val = re.match('^system_type=(.*)', line).group(1) + data.append(f"System Type: {val}") elif "distributed_cloud_role" in line: role = re.match('^distributed_cloud_role=(.*)', line).group(1) - data.append(f"Distributed cloud role: {role}") + data.append(f"DC Role : {role}") elif "sw_version" in line: - data.append( - f"SW Version: " - f"{re.match('^sw_version=(.*)', line).group(1)}" - ) + val = re.match('^sw_version=(.*)', line).group(1) + data.append(f"S/W Version: {val}") + # ... followed by these items + with open(platform_conf) as file: + for line in file: + if "nodetype" in line: + val = re.match('^nodetype=(.*)', line).group(1) + data.append(f"Node Type : {val}") + elif "subfunction" in line: + val = re.match('^subfunction=(.*)', line).group(1) + data.append(f"subfunction: {val}") + elif "oam_interface" in line: + val = re.match('^oam_interface=(.*)', line).group(1) + data.append(f"OAM Iface : {val}") + elif "management_interface" in line: + val = re.match('^management_interface=(.*)', line).group(1) + data.append(f"Mgmt Iface : {val}") + elif "cluster_host_interface" in line: + val = re.match('^cluster_host_interface=(.*)', line).group(1) + data.append(f"Clstr Iface: {val}") + + # /etc/os-release info + with open( + os.path.join(host_dir, "etc", "os-release") + ) as file: + for line in file: + if "PRETTY_NAME" in line: + val = (re.match('^PRETTY_NAME=(.*)', line).group(1)) + val = val.strip('\"') + data.append(f"OS Release : {val}") + + # /etc/build.info with open( os.path.join(host_dir, "etc", "build.info") ) as file: for line in file: if "BUILD_TYPE" in 
line: - data.append( - f"Build Type: " - f"{re.match('^BUILD_TYPE=(.*)', line).group(1)}" - ) - elif re.match("^OS=(.*)", line): - data.append(f"OS: {re.match('^OS=(.*)', line).group(1)}") + val = (re.match('^BUILD_TYPE=(.*)', line).group(1)) + val = val.strip('\"') + data.append(f"Build Type : {val}") + elif "BUILD_DATE" in line: + val = (re.match('^BUILD_DATE=(.*)', line).group(1)) + val = val.strip('\"') + data.append(f"Build Date : {val}") + elif "BUILD_DIR" in line: + val = (re.match('^BUILD_DIR=(.*)', line).group(1)) + val = val.strip('\"') + data.append(f"Build Dir : {val}") - return data + with open(output_dir, "a") as file: + dashs = "-" * len(hostname) + file.write("\n" + hostname + "\n" + dashs + "\n") + for i in data: + file.write(i + "\n") + if loud is True: + print(i) + + if hosts is not None: + for k, v in hosts.items(): + if not len(v.keys()): + continue + if k == "storages": + k += " " + if k == "workers": + k += " " + file.write(f"{k}: {','.join(v.keys())}\n") + if loud is True: + print(f"{k}: {','.join(v.keys())}") + + # create an empty line following the system info dump + if loud is True: + print("") + + return diff --git a/tools/collector/debian-scripts/report/plugins/maintenance_errors b/tools/collector/debian-scripts/report/plugins/maintenance_errors index 1f5bc144..92ebc9b9 100755 --- a/tools/collector/debian-scripts/report/plugins/maintenance_errors +++ b/tools/collector/debian-scripts/report/plugins/maintenance_errors @@ -1 +1,2 @@ algorithm=maintenance_errors +exclude=task clear diff --git a/tools/collector/debian-scripts/report/plugins/sm_errors b/tools/collector/debian-scripts/report/plugins/sm_errors new file mode 100755 index 00000000..fc0b6b89 --- /dev/null +++ b/tools/collector/debian-scripts/report/plugins/sm_errors @@ -0,0 +1,14 @@ +algorithm=substring +files=var/log/sm.log +hosts=controllers + +# logs to exclude +substring=ERROR: sm +exclude=Failed to set alarm +exclude=Failed to set log +exclude=Failed to clear alarm 
+exclude=Failed to get all alarms +exclude=Failed to query service based on pid +exclude=Failed to look up interface name +exclude=Failed to stop service heartbeat thread +exclude=Heartbeat is not required diff --git a/tools/collector/debian-scripts/report/plugins/substring b/tools/collector/debian-scripts/report/plugins/substring index bae11679..25c03ba3 100755 --- a/tools/collector/debian-scripts/report/plugins/substring +++ b/tools/collector/debian-scripts/report/plugins/substring @@ -1,5 +1,5 @@ algorithm=substring -files=var/log/mtcAgent.log, var/log/sm.log +files=var/log/mtcAgent.log hosts=controllers substring=operation failed substring=Failed to send message diff --git a/tools/collector/debian-scripts/report/report.py b/tools/collector/debian-scripts/report/report.py index 3411ff71..9c7bbe43 100755 --- a/tools/collector/debian-scripts/report/report.py +++ b/tools/collector/debian-scripts/report/report.py @@ -446,6 +446,7 @@ bundle_names = [] bundles = [] ignore_list = [analysis_folder_name] ignore_list += ["apps", "horizon", "lighttpd", "lost+found", "sysinv-tmpdir"] +ignore_list += ["patch-api-proxy-tmpdir", "platform-api-proxy-tmpdir"] with open(os.path.join(output_dir, "untar.log"), "a") as logfile: for obj in (os.scandir(input_dir)): @@ -463,6 +464,9 @@ with open(os.path.join(output_dir, "untar.log"), "a") as logfile: date_time = obj.name[-15:] if args.debug: logger.debug("Found Dir : %s : %s", obj.name, date_time) + elif os.path.islink(obj.path): + # ignore sym links + continue else: if not tarfile.is_tarfile(obj.path): continue @@ -559,6 +563,7 @@ elif args.debug: # create the output directory ; report_analysis output_dir = os.path.join(path_file, analysis_folder_name) +print("\nReport: %s\n" % output_dir) if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) @@ -567,7 +572,7 @@ try: engine = ExecutionEngine(args, path_file, output_dir) except ValueError as e: logger.error(str(e)) - sys.exit("Confirm you are running the report tool 
on a collect bundle") + logger.error("Confirm you are running the report tool on a collect bundle") if args.algorithm: plugins.append(Plugin(opts=vars(args))) diff --git a/tools/collector/debian/deb_folder/rules b/tools/collector/debian/deb_folder/rules index e99cc98b..a06aec28 100755 --- a/tools/collector/debian/deb_folder/rules +++ b/tools/collector/debian/deb_folder/rules @@ -30,7 +30,7 @@ override_dh_auto_install: install -m 755 -p expect_done $(ROOT)/usr/local/sbin/expect_done install -m 755 -p mariadb-cli.sh $(ROOT)/usr/local/sbin/mariadb-cli - # Report Tool + # Report Tool install -m 755 -p report/report.py $(ROOT)/usr/local/bin/report/report.py install -m 755 -p report/execution_engine.py $(ROOT)/usr/local/bin/report/execution_engine.py install -m 755 -p report/algorithms.py $(ROOT)/usr/local/bin/report/algorithms.py @@ -38,7 +38,7 @@ override_dh_auto_install: install -m 755 -p report/correlator.py $(ROOT)/usr/local/bin/report/correlator.py install -m 755 -p report/README $(ROOT)/usr/local/bin/report/README - # Report Tool Plugin Algorithms + # Report Tool Plugin Algorithms install -m 755 -p report/plugin_algs/alarm.py $(ROOT)/usr/local/bin/report/plugin_algs/alarm.py install -m 755 -p report/plugin_algs/audit.py $(ROOT)/usr/local/bin/report/plugin_algs/audit.py install -m 755 -p report/plugin_algs/daemon_failures.py $(ROOT)/usr/local/bin/report/plugin_algs/daemon_failures.py @@ -51,20 +51,21 @@ override_dh_auto_install: install -m 755 -p report/plugin_algs/swact_activity.py $(ROOT)/usr/local/bin/report/plugin_algs/swact_activity.py install -m 755 -p report/plugin_algs/system_info.py $(ROOT)/usr/local/bin/report/plugin_algs/system_info.py - # Report Tool Plugins + # Report Tool Plugins install -m 755 -p report/plugins/alarm $(ROOT)/usr/local/bin/report/plugins/alarm install -m 755 -p report/plugins/daemon_failures $(ROOT)/usr/local/bin/report/plugins/daemon_failures install -m 755 -p report/plugins/heartbeat_loss 
$(ROOT)/usr/local/bin/report/plugins/heartbeat_loss install -m 755 -p report/plugins/maintenance_errors $(ROOT)/usr/local/bin/report/plugins/maintenance_errors install -m 755 -p report/plugins/process_failures $(ROOT)/usr/local/bin/report/plugins/process_failures install -m 755 -p report/plugins/puppet_errors $(ROOT)/usr/local/bin/report/plugins/puppet_errors + install -m 755 -p report/plugins/sm_errors $(ROOT)/usr/local/bin/report/plugins/sm_errors install -m 755 -p report/plugins/state_changes $(ROOT)/usr/local/bin/report/plugins/state_changes install -m 755 -p report/plugins/substring $(ROOT)/usr/local/bin/report/plugins/substring install -m 755 -p report/plugins/swact_activity $(ROOT)/usr/local/bin/report/plugins/swact_activity install -m 755 -p report/plugins/system_info $(ROOT)/usr/local/bin/report/plugins/system_info install -m 755 -p report/plugins/substring_hosts $(SYSCONFDIR)/collect/plugins/substring_hosts - # Collect Plugins + # Collect Plugins install -m 755 -p collect_sysinv.sh $(SYSCONFDIR)/collect.d/collect_sysinv install -m 755 -p collect_psqldb.sh $(SYSCONFDIR)/collect.d/collect_psqldb install -m 755 -p collect_mariadb.sh $(SYSCONFDIR)/collect.d/collect_mariadb