diff --git a/tools/collector/debian-scripts/collect b/tools/collector/debian-scripts/collect
index bdae3f35..4c05156b 100755
--- a/tools/collector/debian-scripts/collect
+++ b/tools/collector/debian-scripts/collect
@@ -2941,6 +2941,56 @@ collect_subclouds()
     fi
 }
 
+############################################################################
+#
+# Name      : get_report_tool
+#
+# Purpose   : Fetch report tool from current host
+#
+# Parameters: $1 - local path destination
+#
+############################################################################
+
+function get_report_tool()
+{
+    local local_dest=${1}
+
+    mkdir -p ${local_dest}
+    cp -r /usr/local/bin/report/tool ${local_dest}
+
+    local rc=${?}
+    if [ ${rc} -ne ${PASS} ] ; then
+        report_error "failed to get report tool from /usr/local/bin" ${rc}
+    else
+        ilog "copied report tool from host"
+    fi
+}
+
+############################################################################
+#
+# Name      : get_report_plugins
+#
+# Purpose   : Fetch plugins for report tool from current host
+#
+# Parameters: $1 - local path destination
+#
+############################################################################
+
+function get_report_plugins()
+{
+    local local_dest=${1}
+
+    mkdir -p ${local_dest}
+    cp -r /etc/collect/plugins ${local_dest}
+
+    local rc=${?}
+    if [ ${rc} -ne ${PASS} ] ; then
+        report_error "failed to get report plugins from /etc/collect" ${rc}
+    else
+        ilog "copied plugins for report tool from host"
+    fi
+}
+
 ############################################################################
 #
 # Handle subcloud and system hosts batched collect
@@ -3031,6 +3081,12 @@ echo -n "creating ${COLLECT_TYPE} tarball ${TARBALL_NAME} ... "
 remove_file_local ${COLLECT_ERROR_LOG}
 remove_file_local ${HOST_COLLECT_ERROR_LOG}
 
+get_report_tool ${COLLECT_DIR}/report
+get_report_plugins ${COLLECT_DIR}/report
+
+cd ${COLLECT_DIR}
+tar -czf report_tool.tgz report
+rm -r report
 
 /usr/bin/expect << EOF
 log_user ${USER_LOG_MODE}
diff --git a/tools/collector/debian/deb_folder/collector.install b/tools/collector/debian/deb_folder/collector.install
index 05221956..142bfa80 100644
--- a/tools/collector/debian/deb_folder/collector.install
+++ b/tools/collector/debian/deb_folder/collector.install
@@ -3,3 +3,4 @@ etc/collect.d/* /etc/collect.d
 usr/local/sbin/* /usr/local/sbin
 usr/local/bin/collect /usr/local/bin
 usr/sbin/collect /usr/sbin
+/usr/local/bin/report/* /usr/local/bin/report
diff --git a/tools/collector/debian/deb_folder/rules b/tools/collector/debian/deb_folder/rules
index 6f7a99a7..da6fa289 100755
--- a/tools/collector/debian/deb_folder/rules
+++ b/tools/collector/debian/deb_folder/rules
@@ -13,8 +13,10 @@ override_dh_auto_install:
 
 	install -m 755 -d $(SYSCONFDIR)/collect.d
 	install -m 755 -d $(SYSCONFDIR)/collect
+	install -m 755 -d $(SYSCONFDIR)/collect/plugins        # Report Tool
 	install -m 755 -d $(ROOT)/usr/local/sbin
 	install -m 755 -d $(ROOT)/usr/local/bin
+	install -m 755 -d $(ROOT)/usr/local/bin/report/tool    # Report Tool
 	install -m 755 -d $(SBINDIR)
 	install -m 755 -p collect $(ROOT)/usr/local/sbin/collect
@@ -26,6 +28,24 @@ override_dh_auto_install:
 	install -m 755 -p expect_done $(ROOT)/usr/local/sbin/expect_done
 	install -m 755 -p mariadb-cli.sh $(ROOT)/usr/local/sbin/mariadb-cli
 
+	# Report Tool
+	install -m 755 -p report/report.py $(ROOT)/usr/local/bin/report/tool/report.py
+	install -m 755 -p report/execution_engine.py $(ROOT)/usr/local/bin/report/tool/execution_engine.py
+	install -m 755 -p report/algorithms.py $(ROOT)/usr/local/bin/report/tool/algorithms.py
+	install -m 755 -p report/plugin.py $(ROOT)/usr/local/bin/report/tool/plugin.py
+	install -m 755 -p report/correlator.py $(ROOT)/usr/local/bin/report/tool/correlator.py
+	install -m 755 -p report/README $(ROOT)/usr/local/bin/report/tool/README
+	install -m 755 -p report/plugins/alarm $(SYSCONFDIR)/collect/plugins/alarm
+	install -m 755 -p report/plugins/daemon_failures $(SYSCONFDIR)/collect/plugins/daemon_failures
+	install -m 755 -p report/plugins/heartbeat_loss $(SYSCONFDIR)/collect/plugins/heartbeat_loss
+	install -m 755 -p report/plugins/maintenance_errors $(SYSCONFDIR)/collect/plugins/maintenance_errors
+	install -m 755 -p report/plugins/process_failures $(SYSCONFDIR)/collect/plugins/process_failures
+	install -m 755 -p report/plugins/puppet_errors $(SYSCONFDIR)/collect/plugins/puppet_errors
+	install -m 755 -p report/plugins/state_changes $(SYSCONFDIR)/collect/plugins/state_changes
+	install -m 755 -p report/plugins/substring $(SYSCONFDIR)/collect/plugins/substring
+	install -m 755 -p report/plugins/swact_activity $(SYSCONFDIR)/collect/plugins/swact_activity
+	install -m 755 -p report/plugins/system_info $(SYSCONFDIR)/collect/plugins/system_info
+
 	install -m 755 -p collect_sysinv.sh $(SYSCONFDIR)/collect.d/collect_sysinv
 	install -m 755 -p collect_psqldb.sh $(SYSCONFDIR)/collect.d/collect_psqldb
 	install -m 755 -p collect_mariadb.sh $(SYSCONFDIR)/collect.d/collect_mariadb
diff --git a/tools/collector/report/README b/tools/collector/report/README
index 12a8bd49..b439a313 100644
--- a/tools/collector/report/README
+++ b/tools/collector/report/README
@@ -13,26 +13,28 @@ SELECT_NODES_20220527.193605
 │  ├── etc
 │  ├── root
 │  └── var
-├── plugins (where the plugin files will be placed)
-│  ├── alarm_plugin_example
-│  └── substring_plugin_example
 ├── report
-    └── tool (where the tool will be placed)
+    ├── plugins (where the plugin files will be placed)
+    │  ├── alarm
+    │  ├── substring
+    │  └── ...
+    ├── tool (where the tool will be placed)
     └── output (where the output files will be placed)
 
-> cat plugins/alarm_plugin_example
+> cat plugins/alarm
 algorithm=alarm
-alarm_ids=400.,401.
-entity_ids = host=controller-0
+alarm_ids=400., 401.
+entity_ids=host=controller-0, host=controller-1
 
-> cat plugins/substring_plugin_example
+> cat plugins/substring
 algorithm=substring
-files=var/log/mtcAgent.log
+files=var/log/mtcAgent.log, var/log/sm.log
 hosts=controllers
 substring=operation failed
+substring=Failed to send message
 
 > report/tool/report.py --start 20220501 --end 20220530
 
@@ -41,7 +43,8 @@ The tool also provides default values, more details are in 'report.py -h'.
 
 The substring algorithm creates an output file for every host of the
 specified host type. The files will contain log events within the
-provided date range containing the substring 'operation failed'.
+provided date range containing the substring 'operation failed' and 'Failed
+to send message'.
 
 The alarm algorithm creates two output file: 'log' and 'alarm'
 'log' contains customer log messages created within the provided date range,
@@ -53,10 +56,14 @@ Here is the report directory after running the above command
 
 report
 ├── output
-│   └── 20220815.140008 (time in utc when tool was ran)
-│       ├── alarm
-│       ├── controller-0_substring_plugin_example_substring
-│       ├── controller-1_substring_plugin_example_substring
-│       ├── report.log (log file for report tool)
-│       └── log
-└── tool (where the report tool is)
+│   └── SELECT_NODES_20220527.193605 (collect bundle that the report tool was run on)
+│       ├── plugins (output files for plugins)
+│       │   ├── alarm
+│       │   └── ...
+│       ├── correlator_failures
+│       ├── correlator_events
+│       ├── correlator_state_changes
+│       ├── report.log (log file for report tool)
+│       └── untar.log (log file for untarring collect bundle and host tar files)
+├── plugins (where the plugins files are)
+└── tool (where the report tool is)
diff --git a/tools/collector/report/algorithms.py b/tools/collector/report/algorithms.py
index 2b206455..c65c67e2 100644
--- a/tools/collector/report/algorithms.py
+++ b/tools/collector/report/algorithms.py
@@ -9,8 +9,12 @@
 # Algorithm string constants
 ALARM = "alarm"
 AUDIT = "audit"
-PROCESS_FAILURE = "process_failure"
-PUPPET = "puppet"
+DAEMON_FAILURES = "daemon_failures"
+HEARTBEAT_LOSS = "heartbeat_loss"
+MAINTENANCE_ERR = "maintenance_errors"
+PROCESS_FAILURES = "process_failures"
+PUPPET_ERRORS = "puppet_errors"
+STATE_CHANGES = "state_changes"
 SUBSTRING = "substring"
-SWACT = "swact"
+SWACT_ACTIVITY = "swact_activity"
 SYSTEM_INFO = "system_info"
diff --git a/tools/collector/report/correlator.py b/tools/collector/report/correlator.py
new file mode 100755
index 00000000..76bd94b7
--- /dev/null
+++ b/tools/collector/report/correlator.py
@@ -0,0 +1,471 @@
+########################################################################
+#
+# Copyright (c) 2022 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+########################################################################
+#
+# This file contains the Correlator class.
+# The Correlator class contains algorithms that search for failures.
+#
+# The Correlator class reads through all the output files created by
+# the plugins and detects failures. A summary of the failures and their
+# causes are printed to standard output and an output file is created
+# in the report directory.
+# +######################################################################## + +from datetime import datetime +from datetime import timedelta +import logging +import os +import re + + +logger = logging.getLogger(__name__) + + +class Correlator: + def __init__(self, plugin_output_dir): + """Constructor for the Correlator class + + Parameters: + plugin_output_dir (string): Path to directory with output files from plugins + """ + self.plugin_output_dir = plugin_output_dir + + def run(self, hostname): + """ + Searches through the output files created by the plugins for failures and + determines their causes, as well as extracts significant events and state changes + + Errors: + FileNotFoundError + """ + failures = [] + try: + failures += self.uncontrolled_swact() + except FileNotFoundError as e: + logger.error(e) + + try: + failures += self.mtc_errors() + except FileNotFoundError as e: + logger.error(e) + + events = [] + try: + events += self.get_events(hostname) + except FileNotFoundError as e: + logger.error(e) + + state_changes = [] + try: + state_changes += self.get_state_changes(hostname) + except FileNotFoundError as e: + logger.error(e) + + return sorted(failures), sorted(events), sorted(state_changes) + + def uncontrolled_swact(self): + """ + Searches through the output file created by the swact activity plugin for + uncontrolled swacts and determines their causes through other indicators + + Errors: + FileNotFoundError + """ + data = [] + + # Variables to keep track of indicators for failure causes + start_time = end_time = svc_failed = None + ctrlr_down = None # Active controller that went down, causing swact + ctrlr_svc_fail = None # Active controller where service failed twice in 2 minutes + ctrlr_link_down = None # Original active controller when link between two went down + hb_loss = active_failed = go_active_failed = link_down = False + + # Open output file from swact activity plugin and read it + file_path = os.path.join(self.plugin_output_dir, "swact_activity") + swact_activity = open(file_path, "r") + line = swact_activity.readline() + + while line: + if "Uncontrolled swact" in line and not start_time: + start_time = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") + if "Host from active to failed, Peer from standby to active" in line: + link_down = True + ctrlr_link_down = re.findall("\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:", + line)[0] + elif re.search("Neighbor (.+) is now in the down", line) and start_time and not ctrlr_down: + ctrlr_down = re.findall("Neighbor \((.+)\) received event", line)[0] + elif re.search("Service (.+) is failed and has reached max failures", line) and not svc_failed: + svc_failed = re.findall("Service \((.+)\) is failed", line)[0] + ctrlr_svc_fail = re.findall("\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:", line)[0] + elif svc_failed and re.search("active-failed\s+\| disabling-failed\s+\| " + svc_failed, line): + if re.search("\| go-active-failed\s+\|", line): + go_active_failed = True + else: + active_failed = True + elif "Swact update" in line and start_time and not end_time: + end_time = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") + if ctrlr_down: + try: + hb_loss = self.search_hb_loss(start_time, end_time, ctrlr_down) + except FileNotFoundError as e: + logger.error(e) + + start_time = start_time.strftime("%Y-%m-%dT%H:%M:%S") + end_time = end_time.strftime("%Y-%m-%dT%H:%M:%S") + if link_down: + data.append(start_time + " to " + end_time + " Uncontrolled swact, refer to SM " + "logs for in-depth analysis, original 
active controller: " + + ctrlr_link_down + "\n") + elif ctrlr_down: + if hb_loss: + data.append(start_time + " to " + end_time + " Uncontrolled swact due to " + "spontaneous reset of active controller " + ctrlr_down + "\n") + else: + data.append(start_time + " to " + end_time + " Uncontrolled swact likely due " + "to spontaneous reset of active controller " + ctrlr_down + "\n") + elif svc_failed: + if active_failed and go_active_failed: + data.append(start_time + " to " + end_time + " Uncontrolled swact due to " + "service failure (" + svc_failed + ") twice in 2 minutes was " + "unsuccessful so \"bounced back\" to original active controller " + + ctrlr_svc_fail + "\n") + elif active_failed: + data.append(start_time + " to " + end_time + " Uncontrolled swact due to " + "service failure (" + svc_failed + ") twice in 2 minutes on " + "active controller " + ctrlr_svc_fail + "\n") + else: + data.append(start_time + " to " + end_time + " Uncontrolled swact likely due " + "to service failure (" + svc_failed + ") twice in 2 minutes on " + "active controller " + ctrlr_svc_fail + "\n") + + start_time = end_time = svc_failed = ctrlr_down = ctrlr_svc_fail = ctrlr_link_down = None + hb_loss = active_failed = go_active_failed = link_down = False + + # Read next line + line = swact_activity.readline() + + # Close the output file from swact activity plugin + swact_activity.close() + + return data + + def mtc_errors(self): + """ + Searches through the output file created by the maintenance errors plugin + for failures and determines their causes through other indicators + + Errors: + FileNotFoundError + """ + data = [] + + # Variables to keep track of indicators for failure causes + goenable_start = goenable_end = goenable_host = None + goenable_tst_f = config_tst_f = None # Tests failed + config_start = config_end = config_host = puppet_error = None + hb_loss_start = hb_loss_end = hb_loss_host = None + daemon_fail = comm_loss = auto_recov_dis = False + + # Open output file from maintenance errors plugin and read it + file_path = os.path.join(self.plugin_output_dir, "maintenance_errors") + mtc = open(file_path, "r") + line = mtc.readline() + + while line: + if "auto recovery disabled" in line and not auto_recov_dis: + # Check if previous failure recorded was go-enable, configuration or heartbeat failure + if data and re.search(r"Go-enable|[cC]onfiguration|Heartbeat", data[-1]): + host = re.findall("failure on ([^\s]+)", data[-1]) + # Check if host in auto recovery disabled mode is same as host with previous failure + if host and re.search(host[0] + " auto recovery disabled", line): + old = data[-1].split("due", 1) + if len(old) == 1: + data[-1] = data[-1][:-1] + " (auto recovery disabled)\n" + else: + data[-1] = old[0] + "(auto recovery disabled) due" + old[1] + auto_recov_dis = True + elif "GOENABLED Failed" in line and not goenable_start: + goenable_start, auto_recov_dis = line[0:19], False + goenable_host = re.findall("Error : (.+) got GOENABLED Failed", line)[0] + elif "configuration failed or incomplete" in line and not config_start: + config_start = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") + auto_recov_dis = False + config_host = re.findall("Error : (.+) configuration failed", line)[0] + elif "Heartbeat Loss" in line: + # Check if previous failure recorded was heartbeat loss due to missing heartbeat messages + if ("(during recovery soak)" in line and data and + re.search("missing heartbeat messages", data[-1])): + host = re.findall("failure on (.+) due to", data[-1])[0] + # Check if host 
with hearbeat loss failure is the same as host with previous failure + if re.search(host + " (.+) Heartbeat Loss (.+) \(during recovery soak\)", line): + old = data[-1] + data[-1] = (old[0:23] + line[0:19] + old[42:-1] + + " (recovery over disabled due to heartbeat soak failure)\n") + else: + hb_loss_start, comm_loss, auto_recov_dis = line[0:19], False, False + hb_loss_host = re.findall("Error : (.+) [CM]", line)[0] + # Check if previous failure recorded was heartbeat loss due to missing heartbeat messages + elif ("regained MTCALIVE from host that has rebooted" in line and data and + re.search("Heartbeat loss failure (.+) \(recovery over disabled\)", data[-1])): + host = re.findall("failure on (.+) due to", data[-1])[0] + if re.search(host + " regained MTCALIVE", line): + old = data[-1].split("due", 1)[0] + data[-1] = old[0:23] + line[0:19] + old[42:] + "due to uncontrolled reboot\n" + elif (hb_loss_host and re.search(hb_loss_host + " Loss Of Communication for 5 seconds", line) + and hb_loss_start and not comm_loss): + comm_loss = True + elif re.search("mtcClient --- (.+)Error : FAILED:", line): + if goenable_start and not goenable_tst_f: + goenable_tst_f = re.findall("Error : FAILED: (.+) \(\d", line)[0] + elif config_start and not config_tst_f: + config_tst_f = re.findall("Error : FAILED: (.+) \(\d", line)[0] + elif (goenable_host and + re.search(goenable_host + " Task: In-Test Failure, threshold reached", line) and not + goenable_end): + goenable_end = line[0:19] + if goenable_tst_f: + data.append(goenable_start + " to " + goenable_end + " Go-enable test failure on " + + goenable_host + " due to failing of " + goenable_tst_f + "\n") + else: + data.append(goenable_start + " to " + goenable_end + " Go-enable test failure on " + + goenable_host + " due to unknown test failing\n") + + goenable_start = goenable_end = goenable_host = goenable_tst_f = None + elif (config_host and + re.search(config_host + " Task: Configuration failure, threshold reached", line) and not + config_end): + config_end = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") + if config_tst_f != "/etc/goenabled.d/config_goenabled_check.sh": + try: + daemon_fail = self.search_daemon_fail(config_start, config_end, config_host) + except FileNotFoundError as e: + logger.error(e) + + if config_tst_f == "/etc/goenabled.d/config_goenabled_check.sh" or daemon_fail: + try: + puppet_error = self.search_puppet_error(config_start, config_end) + except FileNotFoundError as e: + logger.error(e) + + config_start = config_start.strftime("%Y-%m-%dT%H:%M:%S") + config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S") + if puppet_error: + data.append(config_start + " to " + config_end + " Configuration failure on " + + config_host + " due to:\n" + puppet_error) + else: + data.append(config_start + " to " + config_end + " Configuration failure on " + + config_host + " due to unknown cause\n") + else: + config_start = config_start.strftime("%Y-%m-%dT%H:%M:%S") + config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S") + data.append(config_start + " to " + config_end + " Possible configuration failure" + " on " + config_host + "\n") + + config_start = config_end = config_host = config_tst_f = puppet_error = None + daemon_fail = False + elif (hb_loss_host and re.search(hb_loss_host + " Connectivity Recovered ", line) and + hb_loss_start and not hb_loss_end): + hb_loss_end = line[0:19] + data.append(hb_loss_start + " to " + hb_loss_end + " Heartbeat loss failure on " + + hb_loss_host + " due to too many missing heartbeat messages\n") + + 
hb_loss_start = hb_loss_end = hb_loss_host = None + comm_loss = False + elif (hb_loss_host and re.search(hb_loss_host + " Graceful Recovery Wait", line) and + hb_loss_start and comm_loss and not hb_loss_end): + hb_loss_end = line[0:19] + data.append(hb_loss_start + " to " + hb_loss_end + " Heartbeat loss failure on " + + hb_loss_host + " due to too many missing heartbeat messages (recovery over disabled)\n") + + hb_loss_start = hb_loss_end = hb_loss_host = None + comm_loss = False + + # Read next line + line = mtc.readline() + + # Close the output file from maintenance errors plugin + mtc.close() + + return data + + def search_hb_loss(self, start_time, end_time, host): + """ + Searches through the output file created by the heartbeat loss plugin for heartbeat loss + message from host between one minute before start_time and end_time + + Errors: + FileNotFoundError + """ + hb_loss = False + + # Open output file from heartbeat loss plugin and read it + file_path = os.path.join(self.plugin_output_dir, "heartbeat_loss") + heartbeat_loss = open(file_path, "r") + line = heartbeat_loss.readline() + + while line: + if re.search("Error : " + host + " (.+) Heartbeat Loss ", line): + date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") + if date >= start_time - timedelta(minutes=1) and date <= end_time: + hb_loss = True + break + + # Read next line + line = heartbeat_loss.readline() + + # Close the output file from heartbeat loss plugin + heartbeat_loss.close() + + return hb_loss + + def search_daemon_fail(self, start_time, end_time, host): + """ + Searches through the output file created by the daemon failures plugin for puppet manifest + failed message from host between 10 seconds before start_time and end_time + + Errors: + FileNotFoundError + """ + daemon_fail = False + + # Open output file from daemon failures plugin and read it + file_path = os.path.join(self.plugin_output_dir, "daemon_failures") + daemon_failures = open(file_path, "r") + line = daemon_failures.readline() + + while line: + if re.search("\d " + host + " (.+) Failed to run the puppet manifest", line): + date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") + if date >= start_time - timedelta(seconds=10) and date <= end_time: + daemon_fail = True + break + + # Read next line + line = daemon_failures.readline() + + # Close the output file from daemon failures plugin + daemon_failures.close() + + return daemon_fail + + + def search_puppet_error(self, start_time, end_time): + """ + Searches through the output file created by the puppet errors plugin for error message + between 10 seconds before start_time and end_time and returns it + + Errors: + FileNotFoundError + """ + puppet_log = None + + # Open output file from puppet errors plugin and read it + file_path = os.path.join(self.plugin_output_dir, "puppet_errors") + puppet_errors = open(file_path, "r") + line = puppet_errors.readline() + + while line: + if "Error: " in line: + date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S") + if date >= start_time - timedelta(seconds=10) and date <= end_time: + puppet_log = line + break + + # Read next line + line = puppet_errors.readline() + + # Close the output file from puppet errors plugin + puppet_errors.close() + + return puppet_log + + + def get_events(self, hostname): + """ + Searches through the output files created by the plugins for significant events + and summarizes them + + Errors: + FileNotFoundError + """ + data = [] + + # Open output file from maintenance errors plugin and read it + file_path = 
os.path.join(self.plugin_output_dir, "maintenance_errors") + mtc = open(file_path, "r") + line = mtc.readline() + + while line: + if "force failed by SM" in line: + host = re.findall("Error : (.+) is being", line)[0] + if hostname == "all" or host == hostname: + data.append(line[0:19] + " " + host + " force failed by SM\n") + elif "Graceful Recovery Failed" in line: + host = re.findall("Info : (.+) Task:", line)[0] + if hostname == "all" or host == hostname: + data.append(line[0:19] + " " + host + " graceful recovery failed\n") + + # Read next line + line = mtc.readline() + + # Close the output file from maintenance errors plugin + mtc.close() + + # Open output file from swact activity plugin and read it + file_path = os.path.join(self.plugin_output_dir, "swact_activity") + swact_activity = open(file_path, "r") + line = swact_activity.readline() + + while line: + if re.search("Service (.+) is failed and has reached max failures", line): + host = re.findall("\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:", line)[0] + svc_failed = re.findall("Service \((.+)\) is failed", line)[0] + if hostname == "all" or host == hostname: + data.append(line[0:19] + " " + host + " service failure (" + svc_failed + ")\n") + + # Read next line + line = swact_activity.readline() + + # Close the output file from swact activity plugin + swact_activity.close() + + return data + + def get_state_changes(self, hostname): + """ + Searches through the output files created by the state changes plugin and + summarizes the changes of state of the hosts + + Errors: + FileNotFoundError + """ + data = [] + + # Open output file from state changes plugin and read it + file_path = os.path.join(self.plugin_output_dir, "state_changes") + state_changes = open(file_path, "r") + line = state_changes.readline() + + while line: + if "is ENABLED" in line: + host = re.findall("Info : (.+) is ENABLED", line)[0] + state = re.findall("is (.+)\n", line)[0].lower() + if hostname == "all" or hostname in host: + data.append(line[0:19] + " " + host + " " + state + "\n") + elif "locked-disabled" in line: + host = re.findall("Info : (.+) u?n?locked-disabled", line)[0] + if hostname == "all" or host == hostname: + data.append(line[0:19] + " " + host + " disabled\n") + + # Read next line + line = state_changes.readline() + + # Close the output file from state changes plugin + state_changes.close() + + return data diff --git a/tools/collector/report/execution_engine.py b/tools/collector/report/execution_engine.py index 3e12abc4..5c36e4d5 100755 --- a/tools/collector/report/execution_engine.py +++ b/tools/collector/report/execution_engine.py @@ -21,15 +21,17 @@ import os import re import shutil import subprocess +import tarfile import algorithms +from correlator import Correlator logger = logging.getLogger(__name__) class ExecutionEngine: - def __init__(self, opts): + def __init__(self, opts, output_directory): """Constructor for the ExecutionEngine class Parameters: @@ -39,6 +41,19 @@ class ExecutionEngine: self.hosts = {"controllers": {}, "workers": {}, "storages": {}} self.active_controller_directory = None + # Uncompresses host tar files if not already done + with open(os.path.join(output_directory, "untar.log"), "a") as log_file: + for obj in (os.scandir(self.opts.directory)): + info = os.path.splitext(obj.name) + if (obj.is_file() and obj.name != "report_tool.tgz" and tarfile.is_tarfile(obj.path) + and not os.path.isdir(os.path.join(self.opts.directory, info[0]))): + try: + subprocess.run(["tar", "xzfC", obj.path, self.opts.directory], + 
stderr=log_file, check=True) + subprocess.run(["echo","uncompressed", obj.name], check=True) + except subprocess.CalledProcessError as e: + logger.error(e) + for folder in (f.path for f in os.scandir(self.opts.directory)): database_path = os.path.join(folder, "var", "extra", "database") host_info_path = os.path.join(folder, "var", "extra", "host.info") @@ -67,13 +82,17 @@ class ExecutionEngine: Errors: FileNotFoundError """ + plugin_output_dir = os.path.join(output_directory, "plugins") + os.makedirs(plugin_output_dir, exist_ok=True) for plugin in plugins: - logger.info(f"Processing plugin: {os.path.basename(plugin.file)}") + processing = "Processing plugin: " + os.path.basename(plugin.file) hosts = {} if ( plugin.state["hosts"] and len(plugin.state["hosts"]) >= 1 ): # if host list is given + logger.info(f"Processing plugin: {os.path.basename(plugin.file)}") + for h in plugin.state["hosts"]: if h == "all": hosts.update(self.hosts["workers"]) @@ -86,24 +105,21 @@ class ExecutionEngine: events = [] if plugin.state["algorithm"] == algorithms.SUBSTRING: - try: - events = self.substring( - plugin.state["substring"], - [ - os.path.join(folderpath, file) - for file in plugin.state["files"] - ], - ) - except FileNotFoundError as e: - logger.error(e) - continue + events = self.substring( + plugin.state["substring"], + [ + os.path.join(folderpath, file) + for file in plugin.state["files"] + ], + ) # creating output file output_file = os.path.join( - output_directory, - f"{hostname}_{os.path.basename(plugin.file)}_{plugin.state['algorithm']}", + plugin_output_dir, + f"substring_{hostname}", ) - logger.info("output at " + output_file) + if self.opts.verbose: + logger.info("output at " + os.path.relpath(output_file)) with open(output_file, "w") as file: file.write( f"Date range: {self.opts.start} until {self.opts.end}\n" @@ -112,18 +128,24 @@ class ExecutionEngine: f"substrings: {' '.join(plugin.state['substring'])}\n" ) for line in events: - file.write(line + "\n") + if line[-1] == "\n": + file.write(line) + else: + file.write(line + "\n") else: if plugin.state["algorithm"] == algorithms.SYSTEM_INFO: info = self.system_info() - system_info_output = os.path.join(output_directory, "system_info") + system_info_output = os.path.join(plugin_output_dir, "system_info") with open(system_info_output, "w") as file: for i in info: file.write(i + "\n") for k, v in self.hosts.items(): file.write(f"{k}: {','.join(v.keys())}\n") - logger.info("output at " + system_info_output) + if self.opts.verbose: + logger.info(processing + ", output at " + os.path.relpath(system_info_output)) + else: + logger.info(processing) elif plugin.state["algorithm"] == algorithms.AUDIT: hosts = {} @@ -134,7 +156,7 @@ class ExecutionEngine: for hostname, folderpath in hosts.items(): self._create_output_file( f"{hostname}_audit", - output_directory, + plugin_output_dir, self.audit( plugin.state["start"], plugin.state["end"], @@ -142,30 +164,30 @@ class ExecutionEngine: folderpath, "var", "log", "dcmanager", "audit.log" ), ), + processing, ) - elif plugin.state["algorithm"] == algorithms.SWACT: + elif plugin.state["algorithm"] == algorithms.SWACT_ACTIVITY: self._create_output_file( - "swact_activity", output_directory, self.swact() + "swact_activity", plugin_output_dir, self.swact_activity(), processing ) - elif plugin.state["algorithm"] == algorithms.PUPPET: + elif plugin.state["algorithm"] == algorithms.PUPPET_ERRORS: self._create_output_file( - "puppet_errors", output_directory, self.puppet() + "puppet_errors", plugin_output_dir, 
self.puppet_errors(), processing ) - elif plugin.state["algorithm"] == algorithms.PROCESS_FAILURE: + elif plugin.state["algorithm"] == algorithms.PROCESS_FAILURES: self._create_output_file( - "process_failures", output_directory, self.process_failure() + "process_failures", plugin_output_dir, self.process_failures(), processing ) elif plugin.state["algorithm"] == algorithms.ALARM: alarms, logs = self.alarm( plugin.state["alarm_ids"], plugin.state["entity_ids"] ) - alarm_output = os.path.join(output_directory, "alarm") - log_output = os.path.join(output_directory, "log") - os.makedirs(os.path.dirname(log_output), exist_ok=True) + alarm_output = os.path.join(plugin_output_dir, "alarm") + log_output = os.path.join(plugin_output_dir, "log") # creating output alarm file with open(alarm_output, "w") as file: @@ -186,8 +208,34 @@ class ExecutionEngine: file.write(f"{k}\n") for date in v["dates"]: file.write(f" {date}\n") - logger.info("output at " + alarm_output) - logger.info("output at " + log_output) + if self.opts.verbose: + logger.info(processing + ", output at " + os.path.relpath(alarm_output) + + ", " + os.path.relpath(log_output)) + else: + logger.info(processing) + elif plugin.state["algorithm"] == algorithms.HEARTBEAT_LOSS: + self._create_output_file( + "heartbeat_loss", plugin_output_dir, self.heartbeat_loss(), processing + ) + elif plugin.state["algorithm"] == algorithms.MAINTENANCE_ERR: + self._create_output_file( + "maintenance_errors", plugin_output_dir, self.maintenance_errors(), processing + ) + elif plugin.state["algorithm"] == algorithms.DAEMON_FAILURES: + self._create_output_file( + "daemon_failures", plugin_output_dir, self.daemon_failures(), processing + ) + elif plugin.state["algorithm"] == algorithms.STATE_CHANGES: + self._create_output_file( + "state_changes", plugin_output_dir, self.state_changes(), processing + ) + + if not self.opts.verbose: + logger.info("Output files for plugins can be found at " + + os.path.relpath(plugin_output_dir)) + + # Running the correlator and printing the output from it + self.run_correlator(output_directory, plugin_output_dir) # Built-in algorithms ------------------------------ def alarm(self, alarm_ids=[], entity_ids=[]): @@ -299,32 +347,51 @@ class ExecutionEngine: data = [] for file in files: - if not os.path.exists(file): - raise FileNotFoundError(f"File not found: {file}") - cont = True - # Searching through file - command = f"""grep -Ea "{'|'.join(s for s in substr)}" {file}""" - status = self._continue(file) + try: + if not os.path.exists(file): + if re.search("controller-1_(.+)/var/log/mtcAgent.log", file): + continue + raise FileNotFoundError(f"File not found: {file}") + cont = True + # Searching through file + command = f"""grep -Ea "{'|'.join(s for s in substr)}" {file} 2>/dev/null""" + status = self._continue(file) - if ( - status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD - ): # continue with current file - if status == CONTINUE_CURRENT: - cont = False - self._evaluate_substring(data, command) - - # Searching through rotated log files - n = 1 - while os.path.exists(f"{file}.{n}.gz") and cont: - command = f"""zgrep -E "{'|'.join(s for s in substr)}" {file}.{n}.gz""" - status = self._continue(f"{file}.{n}.gz", compressed=True) - - if status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD: + if ( + status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD + ): # continue with current file if status == CONTINUE_CURRENT: cont = False self._evaluate_substring(data, command) - n += 1 + # Searching through rotated 
log files that have not been compressed + n = 1 + while os.path.exists(f"{file}.{n}") and cont: + command = f"""grep -Ea "{'|'.join(s for s in substr)}" {file}.{n} 2>/dev/null""" + status = self._continue(f"{file}.{n}") + + if status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD: + if status == CONTINUE_CURRENT: + cont = False + self._evaluate_substring(data, command) + + n += 1 + + # Searching through rotated log files + while os.path.exists(f"{file}.{n}.gz") and cont: + command = f"""zgrep -E "{'|'.join(s for s in substr)}" {file}.{n}.gz 2>/dev/null""" + status = self._continue(f"{file}.{n}.gz", compressed=True) + + if status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD: + if status == CONTINUE_CURRENT: + cont = False + self._evaluate_substring(data, command) + + n += 1 + + except FileNotFoundError as e: + logger.error(e) + continue return sorted(data) @@ -368,7 +435,7 @@ class ExecutionEngine: return data - def swact(self): + def swact_activity(self): """Swact activity algorithm Presents all swacting activity in the system """ @@ -382,8 +449,12 @@ class ExecutionEngine: for _, folder in self.hosts["controllers"].items(): sm_path = os.path.join(folder, "var", "log", "sm.log") sm_files.append(sm_path) + sm_customer_path = os.path.join(folder, "var", "log", "sm-customer.log") + sm_customer_files.append(sm_customer_path) - sm_substrings = ["Swact has started,", "Swact update"] + sm_substrings = ["Uncontrolled swact", "Swact has started,", + "Neighbor (.+) is now in the down", + "Service (.+) has reached max failures", "Swact update"] data = self.substring(sm_substrings, sm_files) for i, line in enumerate(data): @@ -396,28 +467,25 @@ class ExecutionEngine: line += f" SWACT TOOK {swact_end - swact_start} \n" data[i] = line - for _, folder in self.hosts["controllers"].items(): - sm_customer_path = os.path.join(folder, "var", "log", "sm-customer.log") - sm_customer_files.append(sm_customer_path) - - sm_customer_substrings = ["swact"] + sm_customer_substrings = ["swact", "active-failed\s+\| disabling-failed\s+\|"] data += self.substring(sm_customer_substrings, sm_customer_files) return sorted(data) - def puppet(self): - """Puppet error algorithm + def puppet_errors(self): + """Puppet errors algorithm Presents log errors from puppet logs """ data = [] - for _, folder in self.hosts["controllers"].items(): - puppet_folder = os.path.join(folder, "var", "log", "puppet") - command = f"grep -rh 'Error:' {puppet_folder}" - self._evaluate_substring(data, command) + for host_type in self.hosts.keys(): + for _, folder in self.hosts[host_type].items(): + puppet_folder = os.path.join(folder, "var", "log", "puppet") + command = f"""grep -rh "[m ]Error: " {puppet_folder} 2>/dev/null""" + self._evaluate_substring(data, command) return sorted(data) - def process_failure(self): - """Process failure algorithm + def process_failures(self): + """Process failures algorithm Presents log errors from pmond """ data = [] @@ -426,9 +494,85 @@ class ExecutionEngine: for _, folder in self.hosts[host_type].items(): pmond = os.path.join(folder, "var", "log", "pmond.log") files.append(pmond) + data = self.substring(["Error :"], files) + return data + def heartbeat_loss(self): + """Heartbeat loss algorithm + Presents all heartbeat loss error messages in the system + """ + data = [] + hb_files = [] + + for _, folder in self.hosts["controllers"].items(): + hb_path = os.path.join(folder, "var", "log", "hbsAgent.log") + hb_files.append(hb_path) + + hb_substrings = ["Heartbeat Loss"] + data = 
self.substring(hb_substrings, hb_files) + + return sorted(data) + + def maintenance_errors(self): + """Maintenance errors algorithm + Presents maintenance errors and other relevant log messages in the system + """ + data = [] + mtc_files = [] + + for _, folder in self.hosts["controllers"].items(): + agent = os.path.join(folder, "var", "log", "mtcAgent.log") + mtc_files.append(agent) + + for host_type in self.hosts.keys(): + for _, folder in self.hosts[host_type].items(): + client = os.path.join(folder, "var", "log", "mtcClient.log") + mtc_files.append(client) + + mtc_substrings = ["Error : ", "Configuration failure", "In-Test Failure", + "Loss Of Communication", "Graceful Recovery Wait ", + "regained MTCALIVE from host that has rebooted", + "Connectivity Recovered ; ", "auto recovery disabled", + "Graceful Recovery Failed"] + data = self.substring(mtc_substrings, mtc_files) + + return sorted(data) + + def daemon_failures(self): + """Daemon failures algorithm + Presents all failed puppet manifest messages in the system + """ + data = [] + daemon_files = [] + + for host_type in self.hosts.keys(): + for _, folder in self.hosts[host_type].items(): + daemon_path = os.path.join(folder, "var", "log", "daemon.log") + daemon_files.append(daemon_path) + + daemon_substrings = ["Failed to run the puppet manifest"] + data = self.substring(daemon_substrings, daemon_files) + + return sorted(data) + + def state_changes(self): + """State changes algorithm + Presents all messages in the system regarding the state of hosts + """ + data = [] + sc_files = [] + + for _, folder in self.hosts["controllers"].items(): + sc_path = os.path.join(folder, "var", "log", "mtcAgent.log") + sc_files.append(sc_path) + + sc_substrings = ["is ENABLED", "allStateChange (.+)locked-disabled"] + data = self.substring(sc_substrings, sc_files) + + return sorted(data) + def audit(self, start, end, audit_log_path): """Counts audit events in dcmanager within a specified date range @@ -480,6 +624,85 @@ class ExecutionEngine: # ----------------------------------- + def run_correlator(self, output_directory, plugin_output_dir): + """Runs the correlator and prints the results differently based on if the tool was run with or + without the verbose option + + Parameters: + output_directory (string) : directory to place output files from correlator + plugin_output_dir (string) : directory with output files from plugins + """ + correlator = Correlator(plugin_output_dir) + failures, events, state_changes = correlator.run(self.opts.hostname) + failures_len, events_len, state_changes_len = len(failures), len(events), len(state_changes) + failures.append("\nTotal failures found: " + str(failures_len) + "\n") + events.append("\nTotal events found: " + str(events_len) + "\n") + state_changes.append("\nTotal state changes found: " + str(state_changes_len) + "\n") + + logger.info("\nRunning correlator...") + self._create_output_file("correlator_failures", output_directory, failures, "") + self._create_output_file("correlator_events", output_directory, events, "") + self._create_output_file("correlator_state_changes", output_directory, state_changes, "") + + if not self.opts.verbose: + logger.info("Output can be found at " + os.path.relpath(output_directory) + "\n") + logger.info("Failures: " + str(failures_len)) + for f in failures[:-1]: + if "Uncontrolled swact" in f: + logger.info(f[0:19] + " " + re.findall("active controller:? 
(.+)\n", f)[0] + + " uncontrolled swact") + elif "failure on" in f: + host = re.findall("failure on ([^\s]+) ", f)[0] + logger.info(f[0:19] + " " + host + " " + + re.findall("^(.+) failure on ", f[43:])[0].lower() + " failure") + else: + logger.info(f[:-1]) + if failures_len != 0: + logger.info("\nEvents: " + str(events_len)) + else: + logger.info("Events: " + str(events_len)) + logger.info("State Changes: " + str(state_changes_len)) + else: + logger.info("\nFailures: " + str(failures_len)) + for f in failures[:-1]: + logger.info(f[:-1]) + + # Dictionary to keep track of number of times events happens on each host + events_summ = {} + for e in events[:-1]: + k = e[20:-1] + if "service failure" in k: + k = k.split(" (", 1)[0] + if not events_summ.get(k): + events_summ[k] = 1 + else: + events_summ[k] += 1 + + if failures_len != 0: + logger.info("\nEvents: " + str(events_len)) + else: + logger.info("Events: " + str(events_len)) + for k, v in sorted(events_summ.items()): + logger.info(k + ": " + str(v) + " time(s)") + + # Dictionary to keep track of number of times state changes happens on each host + state_changes_summ = {} + for s in state_changes[:-1]: + k = s[20:-1] + if "enabled" in k: + k = k.split("enabled", 1)[0] + "enabled" + if not state_changes_summ.get(k): + state_changes_summ[k] = 1 + else: + state_changes_summ[k] += 1 + + if events_len != 0: + logger.info("\nState Changes: " + str(state_changes_len)) + else: + logger.info("State Changes: " + str(state_changes_len)) + for k, v in sorted(state_changes_summ.items()): + logger.info(k + ": " + str(v) + " time(s)") + def _continue(self, file, compressed=False): CONTINUE_CURRENT = 0 # don't analyze older files, continue with current file CONTINUE_CURRENT_OLD = 1 # analyze older files, continue with current file @@ -519,7 +742,7 @@ class ExecutionEngine: datetime.strptime(date, "%Y-%m-%dT%H:%M:%S") if date > self.opts.start and date < self.opts.end: if line[0] == "|": # sm-customer.log edge case - line = line.replace("|", "").strip() + line = line[1:].strip() line = re.sub("\s+", " ", line) data.append(line) break @@ -531,16 +754,26 @@ class ExecutionEngine: GROUP_ONE = 1 with open(host_info_path) as file: for line in file: - hostname_match = re.match("^hostname => (.+)", line) - subfunction_match = re.match("^subfunction => (.+)", line) + hostname_match = re.match(r"\s*hostname =>\s*\"?([^\"]*)(\n|\"\s*,?\s*\n)", line) + subfunction_match = re.match(r"\s*subfunction =>\s*\"?([^\"]*)(\n|\"\s*,?\s*\n)", line) if subfunction_match: subfunction = subfunction_match.group(GROUP_ONE) if hostname_match: hostname = hostname_match.group(GROUP_ONE) return hostname, subfunction - def _create_output_file(self, filename, directory, events): + def _create_output_file(self, filename, directory, data, processing): with open(os.path.join(directory, filename), "w") as file: - for i in events: - file.write(i + "\n") - logger.info("output at " + os.path.join(directory, filename)) + for i in data: + if i[-1] == "\n": + file.write(i) + else: + file.write(i + "\n") + if self.opts.verbose: + output = "output at " + os.path.relpath(os.path.join(directory, filename)) + if processing == "": + logger.info(output) + else: + logger.info(processing + ", " + output) + elif processing != "": + logger.info(processing) diff --git a/tools/collector/report/plugin.py b/tools/collector/report/plugin.py index 1ecf40e0..9ed6ca0b 100755 --- a/tools/collector/report/plugin.py +++ b/tools/collector/report/plugin.py @@ -141,17 +141,37 @@ class Plugin: raise ValueError( f"plugin: 
{plugin_name} should not have hosts to be specified" ) - elif self.state["algorithm"] == algorithms.SWACT: + elif self.state["algorithm"] == algorithms.SWACT_ACTIVITY: if len(self.state["hosts"]) > 0: raise ValueError( f"plugin: {plugin_name} should not have hosts to be specified" ) - elif self.state["algorithm"] == algorithms.PUPPET: + elif self.state["algorithm"] == algorithms.PUPPET_ERRORS: if len(self.state["hosts"]) > 0: raise ValueError( f"plugin: {plugin_name} should not have hosts to be specified" ) - elif self.state["algorithm"] == algorithms.PROCESS_FAILURE: + elif self.state["algorithm"] == algorithms.PROCESS_FAILURES: + if len(self.state["hosts"]) > 0: + raise ValueError( + f"plugin: {plugin_name} should not have hosts to be specified" + ) + elif self.state["algorithm"] == algorithms.HEARTBEAT_LOSS: + if len(self.state["hosts"]) > 0: + raise ValueError( + f"plugin: {plugin_name} should not have hosts to be specified" + ) + elif self.state["algorithm"] == algorithms.MAINTENANCE_ERR: + if len(self.state["hosts"]) > 0: + raise ValueError( + f"plugin: {plugin_name} should not have hosts to be specified" + ) + elif self.state["algorithm"] == algorithms.DAEMON_FAILURES: + if len(self.state["hosts"]) > 0: + raise ValueError( + f"plugin: {plugin_name} should not have hosts to be specified" + ) + elif self.state["algorithm"] == algorithms.STATE_CHANGES: if len(self.state["hosts"]) > 0: raise ValueError( f"plugin: {plugin_name} should not have hosts to be specified" diff --git a/tools/collector/report/plugins/alarm b/tools/collector/report/plugins/alarm new file mode 100755 index 00000000..8b77a6fd --- /dev/null +++ b/tools/collector/report/plugins/alarm @@ -0,0 +1,3 @@ +algorithm=alarm +alarm_ids=400., 401. +entity_ids=host=controller-0, host=controller-1 diff --git a/tools/collector/report/plugins/daemon_failures b/tools/collector/report/plugins/daemon_failures new file mode 100755 index 00000000..4d5f7404 --- /dev/null +++ b/tools/collector/report/plugins/daemon_failures @@ -0,0 +1 @@ +algorithm=daemon_failures diff --git a/tools/collector/report/plugins/heartbeat_loss b/tools/collector/report/plugins/heartbeat_loss new file mode 100755 index 00000000..2e5b13be --- /dev/null +++ b/tools/collector/report/plugins/heartbeat_loss @@ -0,0 +1 @@ +algorithm=heartbeat_loss diff --git a/tools/collector/report/plugins/maintenance_errors b/tools/collector/report/plugins/maintenance_errors new file mode 100755 index 00000000..1f5bc144 --- /dev/null +++ b/tools/collector/report/plugins/maintenance_errors @@ -0,0 +1 @@ +algorithm=maintenance_errors diff --git a/tools/collector/report/plugins/process_failures b/tools/collector/report/plugins/process_failures new file mode 100755 index 00000000..1dcdf8a8 --- /dev/null +++ b/tools/collector/report/plugins/process_failures @@ -0,0 +1 @@ +algorithm=process_failures diff --git a/tools/collector/report/plugins/puppet_errors b/tools/collector/report/plugins/puppet_errors new file mode 100755 index 00000000..0eb6f0e4 --- /dev/null +++ b/tools/collector/report/plugins/puppet_errors @@ -0,0 +1 @@ +algorithm=puppet_errors diff --git a/tools/collector/report/plugins/state_changes b/tools/collector/report/plugins/state_changes new file mode 100644 index 00000000..7d3fd2a1 --- /dev/null +++ b/tools/collector/report/plugins/state_changes @@ -0,0 +1 @@ +algorithm=state_changes diff --git a/tools/collector/report/plugins/substring b/tools/collector/report/plugins/substring new file mode 100755 index 00000000..bae11679 --- /dev/null +++ 
b/tools/collector/report/plugins/substring @@ -0,0 +1,5 @@ +algorithm=substring +files=var/log/mtcAgent.log, var/log/sm.log +hosts=controllers +substring=operation failed +substring=Failed to send message diff --git a/tools/collector/report/plugins/swact_activity b/tools/collector/report/plugins/swact_activity new file mode 100755 index 00000000..215570ae --- /dev/null +++ b/tools/collector/report/plugins/swact_activity @@ -0,0 +1 @@ +algorithm=swact_activity diff --git a/tools/collector/report/plugins/system_info b/tools/collector/report/plugins/system_info new file mode 100755 index 00000000..0736b730 --- /dev/null +++ b/tools/collector/report/plugins/system_info @@ -0,0 +1 @@ +algorithm=system_info diff --git a/tools/collector/report/report.py b/tools/collector/report/report.py index 520771cf..f08a8e5e 100755 --- a/tools/collector/report/report.py +++ b/tools/collector/report/report.py @@ -17,9 +17,9 @@ # The report tool requires the collect bundle and host tarballs to be # untarred. # -# The report tool reads user plugins from a plugins directory in the -# top level of the collect bundle, and outputs files containing -# relevant logs to a report directory in the top level as well. +# The report tool reads user plugins from the report directory in the +# top level of the collect bundle, and outputs files containing files +# containing relevant logs to this directory as well. # # Typical Usage: # command line functionality @@ -39,24 +39,35 @@ import argparse from cmath import log from datetime import datetime +from datetime import timedelta from datetime import timezone import logging import os import time +import subprocess +import sys from execution_engine import ExecutionEngine from plugin import Plugin now = datetime.now(timezone.utc) -base_dir = os.path.realpath(__file__) -default_path = os.path.join(os.path.dirname(base_dir), "..", "..") +base_dir = os.path.dirname(os.path.realpath(__file__)) +parent_dir = os.path.dirname(base_dir) +default_path = os.path.dirname(parent_dir) plugins = [] parser = argparse.ArgumentParser( description="Log Event Reporter", - epilog="Place plugins in 'plugins' directory at top level of collect bundle. Output files will be placed in 'report' directory." - "\nThis tool will create a report.log file along with other output files", + epilog="Place plugins in 'plugins' directory found in 'report' directory at top level of collect bundle. Output files" + "\nwill be placed in 'report' directory." 
+ "\nThis tool will create a report.log and untar.log file along with other output files.", +) +parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Verbose output", ) parser.add_argument( "-s", @@ -67,7 +78,7 @@ parser.add_argument( parser.add_argument( "-e", "--end", - default=datetime.strftime(now, "%Y%m%d"), + default=datetime.strftime(now + timedelta(days=1), "%Y%m%d"), help="Specify an end date in YYYYMMDD format for analysis (default: current date)", ) parser.add_argument( @@ -81,7 +92,12 @@ parser.add_argument( "-d", "--directory", default=default_path, - help="Specify top level of collect bundle to analyze (default: two levels above current location)", + help="Specify top level of collect bundle to analyze (default: two levels above tool directory)", +) +parser.add_argument( + "--hostname", + default="all", + help="Specify host for correlator to find significant events and state changes for (default: all hosts)", ) subparsers = parser.add_subparsers(help="algorithms", dest="algorithm") @@ -93,10 +109,10 @@ parser_substring = subparsers.add_parser( There will be an output file for each host of the host type specified.""", epilog="Plugin file example:\n" " algorithm=substring\n" - " files=mtcAgent.log, sm.log\n" - " hosts=controllers, workers\n" - " substring=Swact in progress\n" - " substring=Swact update", + " files=var/log/mtcAgent.log, var/log/sm.log\n" + " hosts=controllers\n" + " substring=operation failed\n" + " substring=Failed to send message", ) substring_required = parser_substring.add_argument_group("required arguments") substring_required.add_argument( @@ -124,8 +140,8 @@ parser_alarm = subparsers.add_parser( help="Searches through fm.db.sql.txt for alarms and logs. There are 2 output files: 'alarm', and 'log'", epilog="Plugin file example:\n" " algorithm=alarm\n" - " alarm_ids=400.005,200.004\n" - " entity_ids= host=controller-0,host=controller-1\n", + " alarm_ids=400.005, 200.004\n" + " entity_ids=host=controller-0, host=controller-1\n", ) parser_alarm.add_argument( "--alarm_ids", @@ -151,27 +167,59 @@ parser_system_info = subparsers.add_parser( ) # swact activity algorithm -parser_swact = subparsers.add_parser( - "swact", +parser_swact_activity = subparsers.add_parser( + "swact_activity", formatter_class=argparse.RawTextHelpFormatter, help="Presents system swacting activity", - epilog="Plugin file example:\n" " algorithm=swact\n", + epilog="Plugin file example:\n" " algorithm=swact_activity\n", ) # puppet errors algorithm -parser_puppet = subparsers.add_parser( - "puppet", +parser_puppet_errors = subparsers.add_parser( + "puppet_errors", formatter_class=argparse.RawTextHelpFormatter, help="Presents any puppet errors", - epilog="Plugin file example:\n" " algorithm=puppet\n", + epilog="Plugin file example:\n" " algorithm=puppet_errors\n", ) -# process failure algorithm -parser_process_failure = subparsers.add_parser( - "process_failure", +# process failures algorithm +parser_process_failures = subparsers.add_parser( + "process_failures", formatter_class=argparse.RawTextHelpFormatter, help="Presents any process failures from pmond.log", - epilog="Plugin file example:\n" " algorithm=process_failure\n", + epilog="Plugin file example:\n" " algorithm=process_failures\n", +) + +# daemon failures algorithm +parser_daemon_failures = subparsers.add_parser( + "daemon_failures", + formatter_class=argparse.RawTextHelpFormatter, + help="Presents any puppet manifest failures from daemon.log", + epilog="Plugin file example:\n" " algorithm=daemon_failures\n", +) 
+
+# heartbeat loss algorithm
+parser_heartbeat_loss = subparsers.add_parser(
+    "heartbeat_loss",
+    formatter_class=argparse.RawTextHelpFormatter,
+    help="Presents any heartbeat loss error messages from hbsAgent.log",
+    epilog="Plugin file example:\n" " algorithm=heartbeat_loss\n",
+)
+
+# maintenance errors algorithm
+parser_maintenance_errors = subparsers.add_parser(
+    "maintenance_errors",
+    formatter_class=argparse.RawTextHelpFormatter,
+    help="Presents errors and other relevant messages from mtcAgent.log and mtcClient.log",
+    epilog="Plugin file example:\n" " algorithm=maintenance_errors\n",
+)
+
+# state changes algorithm
+parser_state_changes = subparsers.add_parser(
+    "state_changes",
+    formatter_class=argparse.RawTextHelpFormatter,
+    help="Presents any messages from mtcAgent.log regarding the state of hosts, such as enabled/disabled",
+    epilog="Plugin file example:\n" " algorithm=state_changes\n",
+)
 
 # audit algorithm
@@ -185,11 +233,19 @@ parser_audit = subparsers.add_parser(
     " start=2022-06-01 10:00:00\n"
     " end=2022-06-01 04:00:00\n",
 )
-parser_audit_required = parser_audit.add_argument_group("required arguments")
-parser_audit_required.add_argument("--start", required=True)
-parser_audit_required.add_argument(
+parser_audit.add_argument(
+    "--start",
+    required=False,
+    default=datetime.strftime(now - timedelta(days = 7), "%Y-%m-%d %H:%M:%S"),
+    type=str,
+    help="Specify a start date in YYYY-MM-DD HH:MM:SS format for analysis (not required, default: 1 week ago)"
+)
+parser_audit.add_argument(
     "--end",
-    required=True,
+    required=False,
+    default=datetime.strftime(now, "%Y-%m-%d %H:%M:%S"),
+    type=str,
+    help="Specify an end date in YYYY-MM-DD HH:MM:SS format for analysis (not required, default: today)"
 )
 
 
@@ -197,12 +253,17 @@ args = parser.parse_args()
 args.start = datetime.strptime(args.start, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S")
 args.end = datetime.strptime(args.end, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S")
 
-output_directory = os.path.join(
-    args.directory, "report", "output", now.strftime("%Y%m%d.%H%M%S")
-)
+if args.directory.endswith("/"):
+    output_directory = os.path.join(
+        default_path, "report", "output", os.path.basename(os.path.dirname(args.directory))
+    )
+else:
+    output_directory = os.path.join(
+        default_path, "report", "output", os.path.basename(args.directory)
+    )
 
 # creating report log
-os.makedirs(output_directory)
+os.makedirs(output_directory, exist_ok=True)
 open(os.path.join(output_directory, "report.log"), "w").close()
 
 # setting up logger
@@ -223,17 +284,38 @@ ch.setFormatter(formatter)
 logger.addHandler(ch)
 
 
+if not os.path.isdir(args.directory):
+    sys.exit("Top level of collect bundle given to analyze is not a directory")
+else:
+    for obj in (os.scandir(args.directory)):
+        info = os.path.splitext(obj.name)
+
+        # TODO: ask user which file to report on if more than one tarball in directory
+        # Check if collect tarball is in given directory and extracts it if not already done
+        if (obj.is_file() and info[1] == ".tar"):
+            try:
+                result = subprocess.check_output(["tar", "tf", obj.path], encoding="UTF-8")
+                result = result.split("\n", 1)
+                if not os.path.isdir(os.path.join(args.directory, os.path.dirname(result[0]))):
+                    subprocess.run(["tar", "xfC", obj.path, args.directory], check=True)
+                    subprocess.run(["echo", "extracted", obj.name], check=True)
+                args.directory = os.path.join(args.directory, os.path.dirname(result[0]))
+                break
+            except subprocess.CalledProcessError as e:
+                logger.error(e)
+
 try:
-    engine = ExecutionEngine(args)
+    engine = ExecutionEngine(args, output_directory)
 except ValueError as e:
     logger.error(str(e))
+    sys.exit("Confirm you are running the report tool on a collect bundle")
 
 if args.algorithm:
     plugins.append(Plugin(opts=vars(args)))
 else:
     if args.plugin:
         for p in args.plugin:
-            path = os.path.join(args.directory, "plugins", p)
+            path = os.path.join(default_path, "report", "plugins", p)
             if os.path.exists(path):
                 try:
                     plugins.append(Plugin(path))
@@ -243,7 +325,7 @@ else:
             else:
                 logger.warning(f"{p} plugin does not exist")
     else:
-        path = os.path.join(args.directory, "plugins")
+        path = os.path.join(default_path, "report", "plugins")
        if not os.path.exists(path):
             os.mkdir(path)
             logger.error("Plugins folder is empty")
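
A minimal usage sketch to go with the change, written in the README's "> command" style. It assumes the collect bundle has already been extracted and that the shell is at the top level of that bundle; the bundle name SELECT_NODES_20220527.193605 is the example used in the README, and the explicit -d path is only an illustrative placeholder. Only flags and algorithm names defined in report.py above are used here; per-algorithm options should be listed by 'report.py -h' and 'report.py <algorithm> -h'.

> tar xzf report_tool.tgz                 (the collect change above packages the tool sources as report_tool.tgz inside the bundle)
> report/tool/report.py --start 20220501 --end 20220530
> report/tool/report.py -d /path/to/SELECT_NODES_20220527.193605 --hostname controller-0 --verbose
> report/tool/report.py maintenance_errors

Per the README and execution engine changes above, the output is expected under report/output/<bundle name>/: one file per plugin in plugins/, the correlator summaries (correlator_failures, correlator_events, correlator_state_changes), plus report.log and untar.log.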