Report Tool: Package and add plugins/correlator

This update packages the report tool and its plugin files for Debian,
and bundles them with the collect tool so that they are added to the
'collect' tarballs at the time of creation.

The report tool now allows users to point it at any collect bundle and
have it automatically extract the bundle tarball and each host's tar
file before running.
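
For example (a sketch only: the dates and bundle name are taken from the
README example and the path is a placeholder), after extracting
report_tool.tgz at the top of a bundle, the tool can be pointed at that
or any other bundle with the -d option added by this change:

> report/tool/report.py --start 20220501 --end 20220530 -d /path/to/SELECT_NODES_20220527.193605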

This update also adds heartbeat loss, maintenance errors, daemon
failures, and state changes plugin algorithms to the report tool.
Some of the existing algorithms were enhanced to extract more relevant
log events.

Lastly, a correlator is implemented in the tool. It detects failures in
collect bundles, determines their root causes, and finds significant
events and state changes in the log files. The results are written to
output files and summaries are printed to the command line.

Users can also restrict the correlator so that it only reports events
and state changes for a specific host.
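
A minimal sketch of that host filter (the hostname below is
illustrative); --hostname and -v/--verbose are added by this change,
and --hostname defaults to all hosts:

> report/tool/report.py --hostname controller-0 --verbose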

Test Plan:

PASS: Verify tool is packaged in Debian
PASS: Verify tool is inserted into 'collect' tarballs
PASS: Verify tool extracts tarballs and host tarfiles
PASS: Verify tool can be pointed at any collect bundle and run successfully
PASS: Verify substring plugin algorithm is working
PASS: Verify swact activity plugin algorithm is working
PASS: Verify heartbeat loss plugin algorithm is working
PASS: Verify maintenance errors plugin algorithm is working
PASS: Verify daemon failures plugin algorithm is working
PASS: Verify state changes plugin algorithm is working
PASS: Verify failures and correct root causes are found by correlator
PASS: Verify significant events are found by correlator
PASS: Verify state changes are found by correlator
PASS: Verify failures, events and state changes are written to output files
PASS: Verify tool prints correct info to the command line
PASS: Verify correlator only finds events for specified host
PASS: Verify correlator only finds state changes for specified host

Story: 2010166
Task: 46177
Signed-off-by: Angela Mao <Angela.Mao@windriver.com>
Change-Id: I02e28edf16b342abf2224cc98325d77ba0678055
Angela Mao 2022-11-30 23:33:52 +00:00
parent 5a653a9e4b
commit ceba423fd1
19 changed files with 1043 additions and 133 deletions

View File

@ -2941,6 +2941,56 @@ collect_subclouds()
fi
}
############################################################################
#
# Name : get_report_tool
#
# Purpose : Fetch report tool from current host
#
# Parameters: $1 - local path destination
#
############################################################################
function get_report_tool()
{
local local_dest=${1}
mkdir -p ${local_dest}
cp -r /usr/local/bin/report/tool ${local_dest}
local rc=${?}
if [ ${rc} -ne ${PASS} ] ; then
report_error "failed to get report tool from /usr/local/bin" ${rc}
else
ilog "copied report tool from host"
fi
}
############################################################################
#
# Name : get_report_plugins
#
# Purpose : Fetch plugins for report tool from current host
#
# Parameters: $1 - local path destination
#
############################################################################
function get_report_plugins()
{
local local_dest=${1}
mkdir -p ${local_dest}
cp -r /etc/collect/plugins ${local_dest}
local rc=${?}
if [ ${rc} -ne ${PASS} ] ; then
report_error "failed to get report plugins from /etc/collect" ${rc}
else
ilog "copied plugins for report tool from host"
fi
}
############################################################################
#
# Handle subcloud and system hosts batched collect
@ -3031,6 +3081,12 @@ echo -n "creating ${COLLECT_TYPE} tarball ${TARBALL_NAME} ... "
remove_file_local ${COLLECT_ERROR_LOG}
remove_file_local ${HOST_COLLECT_ERROR_LOG}
get_report_tool ${COLLECT_DIR}/report
get_report_plugins ${COLLECT_DIR}/report
cd ${COLLECT_DIR}
tar -czf report_tool.tgz report
rm -r report
/usr/bin/expect << EOF
log_user ${USER_LOG_MODE}

View File

@ -3,3 +3,4 @@ etc/collect.d/* /etc/collect.d
usr/local/sbin/* /usr/local/sbin
usr/local/bin/collect /usr/local/bin
usr/sbin/collect /usr/sbin
/usr/local/bin/report/* /usr/local/bin/report

View File

@ -13,8 +13,10 @@ override_dh_auto_install:
install -m 755 -d $(SYSCONFDIR)/collect.d
install -m 755 -d $(SYSCONFDIR)/collect
install -m 755 -d $(SYSCONFDIR)/collect/plugins # Report Tool
install -m 755 -d $(ROOT)/usr/local/sbin
install -m 755 -d $(ROOT)/usr/local/bin
install -m 755 -d $(ROOT)/usr/local/bin/report/tool # Report Tool
install -m 755 -d $(SBINDIR)
install -m 755 -p collect $(ROOT)/usr/local/sbin/collect
@ -26,6 +28,24 @@ override_dh_auto_install:
install -m 755 -p expect_done $(ROOT)/usr/local/sbin/expect_done
install -m 755 -p mariadb-cli.sh $(ROOT)/usr/local/sbin/mariadb-cli
# Report Tool
install -m 755 -p report/report.py $(ROOT)/usr/local/bin/report/tool/report.py
install -m 755 -p report/execution_engine.py $(ROOT)/usr/local/bin/report/tool/execution_engine.py
install -m 755 -p report/algorithms.py $(ROOT)/usr/local/bin/report/tool/algorithms.py
install -m 755 -p report/plugin.py $(ROOT)/usr/local/bin/report/tool/plugin.py
install -m 755 -p report/correlator.py $(ROOT)/usr/local/bin/report/tool/correlator.py
install -m 755 -p report/README $(ROOT)/usr/local/bin/report/tool/README
install -m 755 -p report/plugins/alarm $(SYSCONFDIR)/collect/plugins/alarm
install -m 755 -p report/plugins/daemon_failures $(SYSCONFDIR)/collect/plugins/daemon_failures
install -m 755 -p report/plugins/heartbeat_loss $(SYSCONFDIR)/collect/plugins/heartbeat_loss
install -m 755 -p report/plugins/maintenance_errors $(SYSCONFDIR)/collect/plugins/maintenance_errors
install -m 755 -p report/plugins/process_failures $(SYSCONFDIR)/collect/plugins/process_failures
install -m 755 -p report/plugins/puppet_errors $(SYSCONFDIR)/collect/plugins/puppet_errors
install -m 755 -p report/plugins/state_changes $(SYSCONFDIR)/collect/plugins/state_changes
install -m 755 -p report/plugins/substring $(SYSCONFDIR)/collect/plugins/substring
install -m 755 -p report/plugins/swact_activity $(SYSCONFDIR)/collect/plugins/swact_activity
install -m 755 -p report/plugins/system_info $(SYSCONFDIR)/collect/plugins/system_info
install -m 755 -p collect_sysinv.sh $(SYSCONFDIR)/collect.d/collect_sysinv
install -m 755 -p collect_psqldb.sh $(SYSCONFDIR)/collect.d/collect_psqldb
install -m 755 -p collect_mariadb.sh $(SYSCONFDIR)/collect.d/collect_mariadb

View File

@ -13,26 +13,28 @@ SELECT_NODES_20220527.193605
│  ├── etc
│  ├── root
│  └── var
├── plugins (where the plugin files will be placed)
│  ├── alarm_plugin_example
│  └── substring_plugin_example
├── report
└── tool (where the tool will be placed)
├── plugins (where the plugin files will be placed)
│  ├── alarm
│  ├── substring
│  └── ...
├── tool (where the tool will be placed)
└── output (where the output files will be placed)
> cat plugins/alarm_plugin_example
> cat plugins/alarm
algorithm=alarm
alarm_ids=400.,401.
entity_ids = host=controller-0
alarm_ids=400., 401.
entity_ids=host=controller-0, host=controller-1
> cat plugins/substring_plugin_example
> cat plugins/substring
algorithm=substring
files=var/log/mtcAgent.log
files=var/log/mtcAgent.log, var/log/sm.log
hosts=controllers
substring=operation failed
substring=Failed to send message
> report/tool/report.py --start 20220501 --end 20220530
@ -41,7 +43,8 @@ The tool also provides default values, more details are in 'report.py -h'.
The substring algorithm creates an output file for every host of the
specified host type. The files will contain log events within the
provided date range containing the substring 'operation failed'.
provided date range containing the substring 'operation failed' and 'Failed
to send message'.
The alarm algorithm creates two output files: 'log' and 'alarm'
'log' contains customer log messages created within the provided date range,
@ -53,10 +56,14 @@ Here is the report directory after running the above command
report
├── output
│ └── 20220815.140008 (time in utc when tool was ran)
│ ├── alarm
│ ├── controller-0_substring_plugin_example_substring
│ ├── controller-1_substring_plugin_example_substring
│ ├── report.log (log file for report tool)
│ └── log
└── tool (where the report tool is)
│ └── SELECT_NODES_20220527.193605 (collect bundle that the report tool was run on)
│ ├── plugins (output files for plugins)
│ │ ├── alarm
│ │ └── ...
│ ├── correlator_failures
│ ├── correlator_events
│ ├── correlator_state_changes
│ ├── report.log (log file for report tool)
│ └── untar.log (log file for untarring collect bundle and host tar files)
├── plugins (where the plugins files are)
└── tool (where the report tool is)

View File

@ -9,8 +9,12 @@
# Algorithm string constants
ALARM = "alarm"
AUDIT = "audit"
PROCESS_FAILURE = "process_failure"
PUPPET = "puppet"
DAEMON_FAILURES = "daemon_failures"
HEARTBEAT_LOSS = "heartbeat_loss"
MAINTENANCE_ERR = "maintenance_errors"
PROCESS_FAILURES = "process_failures"
PUPPET_ERRORS = "puppet_errors"
STATE_CHANGES = "state_changes"
SUBSTRING = "substring"
SWACT = "swact"
SWACT_ACTIVITY = "swact_activity"
SYSTEM_INFO = "system_info"

View File

@ -0,0 +1,471 @@
########################################################################
#
# Copyright (c) 2022 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
########################################################################
#
# This file contains the Correlator class.
# The Correlator class contains algorithms that search for failures.
#
# The Correlator class reads through all the output files created by
# the plugins and detects failures. A summary of the failures and their
# causes are printed to standard output and an output file is created
# in the report directory.
#
########################################################################
from datetime import datetime
from datetime import timedelta
import logging
import os
import re
logger = logging.getLogger(__name__)
class Correlator:
def __init__(self, plugin_output_dir):
"""Constructor for the Correlator class
Parameters:
plugin_output_dir (string): Path to directory with output files from plugins
"""
self.plugin_output_dir = plugin_output_dir
def run(self, hostname):
"""
Searches through the output files created by the plugins for failures and
determines their causes, as well as extracts significant events and state changes
Errors:
FileNotFoundError
"""
failures = []
try:
failures += self.uncontrolled_swact()
except FileNotFoundError as e:
logger.error(e)
try:
failures += self.mtc_errors()
except FileNotFoundError as e:
logger.error(e)
events = []
try:
events += self.get_events(hostname)
except FileNotFoundError as e:
logger.error(e)
state_changes = []
try:
state_changes += self.get_state_changes(hostname)
except FileNotFoundError as e:
logger.error(e)
return sorted(failures), sorted(events), sorted(state_changes)
def uncontrolled_swact(self):
"""
Searches through the output file created by the swact activity plugin for
uncontrolled swacts and determines their causes through other indicators
Errors:
FileNotFoundError
"""
data = []
# Variables to keep track of indicators for failure causes
start_time = end_time = svc_failed = None
ctrlr_down = None # Active controller that went down, causing swact
ctrlr_svc_fail = None # Active controller where service failed twice in 2 minutes
ctrlr_link_down = None # Original active controller when link between two went down
hb_loss = active_failed = go_active_failed = link_down = False
# Open output file from swact activity plugin and read it
file_path = os.path.join(self.plugin_output_dir, "swact_activity")
swact_activity = open(file_path, "r")
line = swact_activity.readline()
while line:
if "Uncontrolled swact" in line and not start_time:
start_time = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
if "Host from active to failed, Peer from standby to active" in line:
link_down = True
ctrlr_link_down = re.findall("\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:",
line)[0]
elif re.search("Neighbor (.+) is now in the down", line) and start_time and not ctrlr_down:
ctrlr_down = re.findall("Neighbor \((.+)\) received event", line)[0]
elif re.search("Service (.+) is failed and has reached max failures", line) and not svc_failed:
svc_failed = re.findall("Service \((.+)\) is failed", line)[0]
ctrlr_svc_fail = re.findall("\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:", line)[0]
elif svc_failed and re.search("active-failed\s+\| disabling-failed\s+\| " + svc_failed, line):
if re.search("\| go-active-failed\s+\|", line):
go_active_failed = True
else:
active_failed = True
elif "Swact update" in line and start_time and not end_time:
end_time = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
if ctrlr_down:
try:
hb_loss = self.search_hb_loss(start_time, end_time, ctrlr_down)
except FileNotFoundError as e:
logger.error(e)
start_time = start_time.strftime("%Y-%m-%dT%H:%M:%S")
end_time = end_time.strftime("%Y-%m-%dT%H:%M:%S")
if link_down:
data.append(start_time + " to " + end_time + " Uncontrolled swact, refer to SM "
"logs for in-depth analysis, original active controller: " +
ctrlr_link_down + "\n")
elif ctrlr_down:
if hb_loss:
data.append(start_time + " to " + end_time + " Uncontrolled swact due to "
"spontaneous reset of active controller " + ctrlr_down + "\n")
else:
data.append(start_time + " to " + end_time + " Uncontrolled swact likely due "
"to spontaneous reset of active controller " + ctrlr_down + "\n")
elif svc_failed:
if active_failed and go_active_failed:
data.append(start_time + " to " + end_time + " Uncontrolled swact due to "
"service failure (" + svc_failed + ") twice in 2 minutes was "
"unsuccessful so \"bounced back\" to original active controller " +
ctrlr_svc_fail + "\n")
elif active_failed:
data.append(start_time + " to " + end_time + " Uncontrolled swact due to "
"service failure (" + svc_failed + ") twice in 2 minutes on "
"active controller " + ctrlr_svc_fail + "\n")
else:
data.append(start_time + " to " + end_time + " Uncontrolled swact likely due "
"to service failure (" + svc_failed + ") twice in 2 minutes on "
"active controller " + ctrlr_svc_fail + "\n")
start_time = end_time = svc_failed = ctrlr_down = ctrlr_svc_fail = ctrlr_link_down = None
hb_loss = active_failed = go_active_failed = link_down = False
# Read next line
line = swact_activity.readline()
# Close the output file from swact activity plugin
swact_activity.close()
return data
def mtc_errors(self):
"""
Searches through the output file created by the maintenance errors plugin
for failures and determines their causes through other indicators
Errors:
FileNotFoundError
"""
data = []
# Variables to keep track of indicators for failure causes
goenable_start = goenable_end = goenable_host = None
goenable_tst_f = config_tst_f = None # Tests failed
config_start = config_end = config_host = puppet_error = None
hb_loss_start = hb_loss_end = hb_loss_host = None
daemon_fail = comm_loss = auto_recov_dis = False
# Open output file from maintenance errors plugin and read it
file_path = os.path.join(self.plugin_output_dir, "maintenance_errors")
mtc = open(file_path, "r")
line = mtc.readline()
while line:
if "auto recovery disabled" in line and not auto_recov_dis:
# Check if previous failure recorded was go-enable, configuration or heartbeat failure
if data and re.search(r"Go-enable|[cC]onfiguration|Heartbeat", data[-1]):
host = re.findall("failure on ([^\s]+)", data[-1])
# Check if host in auto recovery disabled mode is same as host with previous failure
if host and re.search(host[0] + " auto recovery disabled", line):
old = data[-1].split("due", 1)
if len(old) == 1:
data[-1] = data[-1][:-1] + " (auto recovery disabled)\n"
else:
data[-1] = old[0] + "(auto recovery disabled) due" + old[1]
auto_recov_dis = True
elif "GOENABLED Failed" in line and not goenable_start:
goenable_start, auto_recov_dis = line[0:19], False
goenable_host = re.findall("Error : (.+) got GOENABLED Failed", line)[0]
elif "configuration failed or incomplete" in line and not config_start:
config_start = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
auto_recov_dis = False
config_host = re.findall("Error : (.+) configuration failed", line)[0]
elif "Heartbeat Loss" in line:
# Check if previous failure recorded was heartbeat loss due to missing heartbeat messages
if ("(during recovery soak)" in line and data and
re.search("missing heartbeat messages", data[-1])):
host = re.findall("failure on (.+) due to", data[-1])[0]
# Check if host with heartbeat loss failure is the same as host with previous failure
if re.search(host + " (.+) Heartbeat Loss (.+) \(during recovery soak\)", line):
old = data[-1]
data[-1] = (old[0:23] + line[0:19] + old[42:-1] +
" (recovery over disabled due to heartbeat soak failure)\n")
else:
hb_loss_start, comm_loss, auto_recov_dis = line[0:19], False, False
hb_loss_host = re.findall("Error : (.+) [CM]", line)[0]
# Check if previous failure recorded was heartbeat loss due to missing heartbeat messages
elif ("regained MTCALIVE from host that has rebooted" in line and data and
re.search("Heartbeat loss failure (.+) \(recovery over disabled\)", data[-1])):
host = re.findall("failure on (.+) due to", data[-1])[0]
if re.search(host + " regained MTCALIVE", line):
old = data[-1].split("due", 1)[0]
data[-1] = old[0:23] + line[0:19] + old[42:] + "due to uncontrolled reboot\n"
elif (hb_loss_host and re.search(hb_loss_host + " Loss Of Communication for 5 seconds", line)
and hb_loss_start and not comm_loss):
comm_loss = True
elif re.search("mtcClient --- (.+)Error : FAILED:", line):
if goenable_start and not goenable_tst_f:
goenable_tst_f = re.findall("Error : FAILED: (.+) \(\d", line)[0]
elif config_start and not config_tst_f:
config_tst_f = re.findall("Error : FAILED: (.+) \(\d", line)[0]
elif (goenable_host and
re.search(goenable_host + " Task: In-Test Failure, threshold reached", line) and not
goenable_end):
goenable_end = line[0:19]
if goenable_tst_f:
data.append(goenable_start + " to " + goenable_end + " Go-enable test failure on " +
goenable_host + " due to failing of " + goenable_tst_f + "\n")
else:
data.append(goenable_start + " to " + goenable_end + " Go-enable test failure on " +
goenable_host + " due to unknown test failing\n")
goenable_start = goenable_end = goenable_host = goenable_tst_f = None
elif (config_host and
re.search(config_host + " Task: Configuration failure, threshold reached", line) and not
config_end):
config_end = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
if config_tst_f != "/etc/goenabled.d/config_goenabled_check.sh":
try:
daemon_fail = self.search_daemon_fail(config_start, config_end, config_host)
except FileNotFoundError as e:
logger.error(e)
if config_tst_f == "/etc/goenabled.d/config_goenabled_check.sh" or daemon_fail:
try:
puppet_error = self.search_puppet_error(config_start, config_end)
except FileNotFoundError as e:
logger.error(e)
config_start = config_start.strftime("%Y-%m-%dT%H:%M:%S")
config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S")
if puppet_error:
data.append(config_start + " to " + config_end + " Configuration failure on " +
config_host + " due to:\n" + puppet_error)
else:
data.append(config_start + " to " + config_end + " Configuration failure on " +
config_host + " due to unknown cause\n")
else:
config_start = config_start.strftime("%Y-%m-%dT%H:%M:%S")
config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S")
data.append(config_start + " to " + config_end + " Possible configuration failure"
" on " + config_host + "\n")
config_start = config_end = config_host = config_tst_f = puppet_error = None
daemon_fail = False
elif (hb_loss_host and re.search(hb_loss_host + " Connectivity Recovered ", line) and
hb_loss_start and not hb_loss_end):
hb_loss_end = line[0:19]
data.append(hb_loss_start + " to " + hb_loss_end + " Heartbeat loss failure on " +
hb_loss_host + " due to too many missing heartbeat messages\n")
hb_loss_start = hb_loss_end = hb_loss_host = None
comm_loss = False
elif (hb_loss_host and re.search(hb_loss_host + " Graceful Recovery Wait", line) and
hb_loss_start and comm_loss and not hb_loss_end):
hb_loss_end = line[0:19]
data.append(hb_loss_start + " to " + hb_loss_end + " Heartbeat loss failure on " +
hb_loss_host + " due to too many missing heartbeat messages (recovery over disabled)\n")
hb_loss_start = hb_loss_end = hb_loss_host = None
comm_loss = False
# Read next line
line = mtc.readline()
# Close the output file from maintenance errors plugin
mtc.close()
return data
def search_hb_loss(self, start_time, end_time, host):
"""
Searches through the output file created by the heartbeat loss plugin for heartbeat loss
message from host between one minute before start_time and end_time
Errors:
FileNotFoundError
"""
hb_loss = False
# Open output file from heartbeat loss plugin and read it
file_path = os.path.join(self.plugin_output_dir, "heartbeat_loss")
heartbeat_loss = open(file_path, "r")
line = heartbeat_loss.readline()
while line:
if re.search("Error : " + host + " (.+) Heartbeat Loss ", line):
date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
if date >= start_time - timedelta(minutes=1) and date <= end_time:
hb_loss = True
break
# Read next line
line = heartbeat_loss.readline()
# Close the output file from heartbeat loss plugin
heartbeat_loss.close()
return hb_loss
def search_daemon_fail(self, start_time, end_time, host):
"""
Searches through the output file created by the daemon failures plugin for puppet manifest
failed message from host between 10 seconds before start_time and end_time
Errors:
FileNotFoundError
"""
daemon_fail = False
# Open output file from daemon failures plugin and read it
file_path = os.path.join(self.plugin_output_dir, "daemon_failures")
daemon_failures = open(file_path, "r")
line = daemon_failures.readline()
while line:
if re.search("\d " + host + " (.+) Failed to run the puppet manifest", line):
date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
if date >= start_time - timedelta(seconds=10) and date <= end_time:
daemon_fail = True
break
# Read next line
line = daemon_failures.readline()
# Close the output file from daemon failures plugin
daemon_failures.close()
return daemon_fail
def search_puppet_error(self, start_time, end_time):
"""
Searches through the output file created by the puppet errors plugin for error message
between 10 seconds before start_time and end_time and returns it
Errors:
FileNotFoundError
"""
puppet_log = None
# Open output file from puppet errors plugin and read it
file_path = os.path.join(self.plugin_output_dir, "puppet_errors")
puppet_errors = open(file_path, "r")
line = puppet_errors.readline()
while line:
if "Error: " in line:
date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
if date >= start_time - timedelta(seconds=10) and date <= end_time:
puppet_log = line
break
# Read next line
line = puppet_errors.readline()
# Close the output file from puppet errors plugin
puppet_errors.close()
return puppet_log
def get_events(self, hostname):
"""
Searches through the output files created by the plugins for significant events
and summarizes them
Errors:
FileNotFoundError
"""
data = []
# Open output file from maintenance errors plugin and read it
file_path = os.path.join(self.plugin_output_dir, "maintenance_errors")
mtc = open(file_path, "r")
line = mtc.readline()
while line:
if "force failed by SM" in line:
host = re.findall("Error : (.+) is being", line)[0]
if hostname == "all" or host == hostname:
data.append(line[0:19] + " " + host + " force failed by SM\n")
elif "Graceful Recovery Failed" in line:
host = re.findall("Info : (.+) Task:", line)[0]
if hostname == "all" or host == hostname:
data.append(line[0:19] + " " + host + " graceful recovery failed\n")
# Read next line
line = mtc.readline()
# Close the output file from maintenance errors plugin
mtc.close()
# Open output file from swact activity plugin and read it
file_path = os.path.join(self.plugin_output_dir, "swact_activity")
swact_activity = open(file_path, "r")
line = swact_activity.readline()
while line:
if re.search("Service (.+) is failed and has reached max failures", line):
host = re.findall("\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:", line)[0]
svc_failed = re.findall("Service \((.+)\) is failed", line)[0]
if hostname == "all" or host == hostname:
data.append(line[0:19] + " " + host + " service failure (" + svc_failed + ")\n")
# Read next line
line = swact_activity.readline()
# Close the output file from swact activity plugin
swact_activity.close()
return data
def get_state_changes(self, hostname):
"""
Searches through the output files created by the state changes plugin and
summarizes the changes of state of the hosts
Errors:
FileNotFoundError
"""
data = []
# Open output file from state changes plugin and read it
file_path = os.path.join(self.plugin_output_dir, "state_changes")
state_changes = open(file_path, "r")
line = state_changes.readline()
while line:
if "is ENABLED" in line:
host = re.findall("Info : (.+) is ENABLED", line)[0]
state = re.findall("is (.+)\n", line)[0].lower()
if hostname == "all" or hostname in host:
data.append(line[0:19] + " " + host + " " + state + "\n")
elif "locked-disabled" in line:
host = re.findall("Info : (.+) u?n?locked-disabled", line)[0]
if hostname == "all" or host == hostname:
data.append(line[0:19] + " " + host + " disabled\n")
# Read next line
line = state_changes.readline()
# Close the output file from state changes plugin
state_changes.close()
return data

View File

@ -21,15 +21,17 @@ import os
import re
import shutil
import subprocess
import tarfile
import algorithms
from correlator import Correlator
logger = logging.getLogger(__name__)
class ExecutionEngine:
def __init__(self, opts):
def __init__(self, opts, output_directory):
"""Constructor for the ExecutionEngine class
Parameters:
@ -39,6 +41,19 @@ class ExecutionEngine:
self.hosts = {"controllers": {}, "workers": {}, "storages": {}}
self.active_controller_directory = None
# Uncompresses host tar files if not already done
with open(os.path.join(output_directory, "untar.log"), "a") as log_file:
for obj in (os.scandir(self.opts.directory)):
info = os.path.splitext(obj.name)
if (obj.is_file() and obj.name != "report_tool.tgz" and tarfile.is_tarfile(obj.path)
and not os.path.isdir(os.path.join(self.opts.directory, info[0]))):
try:
subprocess.run(["tar", "xzfC", obj.path, self.opts.directory],
stderr=log_file, check=True)
subprocess.run(["echo","uncompressed", obj.name], check=True)
except subprocess.CalledProcessError as e:
logger.error(e)
for folder in (f.path for f in os.scandir(self.opts.directory)):
database_path = os.path.join(folder, "var", "extra", "database")
host_info_path = os.path.join(folder, "var", "extra", "host.info")
@ -67,13 +82,17 @@ class ExecutionEngine:
Errors:
FileNotFoundError
"""
plugin_output_dir = os.path.join(output_directory, "plugins")
os.makedirs(plugin_output_dir, exist_ok=True)
for plugin in plugins:
logger.info(f"Processing plugin: {os.path.basename(plugin.file)}")
processing = "Processing plugin: " + os.path.basename(plugin.file)
hosts = {}
if (
plugin.state["hosts"] and len(plugin.state["hosts"]) >= 1
): # if host list is given
logger.info(f"Processing plugin: {os.path.basename(plugin.file)}")
for h in plugin.state["hosts"]:
if h == "all":
hosts.update(self.hosts["workers"])
@ -86,24 +105,21 @@ class ExecutionEngine:
events = []
if plugin.state["algorithm"] == algorithms.SUBSTRING:
try:
events = self.substring(
plugin.state["substring"],
[
os.path.join(folderpath, file)
for file in plugin.state["files"]
],
)
except FileNotFoundError as e:
logger.error(e)
continue
events = self.substring(
plugin.state["substring"],
[
os.path.join(folderpath, file)
for file in plugin.state["files"]
],
)
# creating output file
output_file = os.path.join(
output_directory,
f"{hostname}_{os.path.basename(plugin.file)}_{plugin.state['algorithm']}",
plugin_output_dir,
f"substring_{hostname}",
)
logger.info("output at " + output_file)
if self.opts.verbose:
logger.info("output at " + os.path.relpath(output_file))
with open(output_file, "w") as file:
file.write(
f"Date range: {self.opts.start} until {self.opts.end}\n"
@ -112,18 +128,24 @@ class ExecutionEngine:
f"substrings: {' '.join(plugin.state['substring'])}\n"
)
for line in events:
file.write(line + "\n")
if line[-1] == "\n":
file.write(line)
else:
file.write(line + "\n")
else:
if plugin.state["algorithm"] == algorithms.SYSTEM_INFO:
info = self.system_info()
system_info_output = os.path.join(output_directory, "system_info")
system_info_output = os.path.join(plugin_output_dir, "system_info")
with open(system_info_output, "w") as file:
for i in info:
file.write(i + "\n")
for k, v in self.hosts.items():
file.write(f"{k}: {','.join(v.keys())}\n")
logger.info("output at " + system_info_output)
if self.opts.verbose:
logger.info(processing + ", output at " + os.path.relpath(system_info_output))
else:
logger.info(processing)
elif plugin.state["algorithm"] == algorithms.AUDIT:
hosts = {}
@ -134,7 +156,7 @@ class ExecutionEngine:
for hostname, folderpath in hosts.items():
self._create_output_file(
f"{hostname}_audit",
output_directory,
plugin_output_dir,
self.audit(
plugin.state["start"],
plugin.state["end"],
@ -142,30 +164,30 @@ class ExecutionEngine:
folderpath, "var", "log", "dcmanager", "audit.log"
),
),
processing,
)
elif plugin.state["algorithm"] == algorithms.SWACT:
elif plugin.state["algorithm"] == algorithms.SWACT_ACTIVITY:
self._create_output_file(
"swact_activity", output_directory, self.swact()
"swact_activity", plugin_output_dir, self.swact_activity(), processing
)
elif plugin.state["algorithm"] == algorithms.PUPPET:
elif plugin.state["algorithm"] == algorithms.PUPPET_ERRORS:
self._create_output_file(
"puppet_errors", output_directory, self.puppet()
"puppet_errors", plugin_output_dir, self.puppet_errors(), processing
)
elif plugin.state["algorithm"] == algorithms.PROCESS_FAILURE:
elif plugin.state["algorithm"] == algorithms.PROCESS_FAILURES:
self._create_output_file(
"process_failures", output_directory, self.process_failure()
"process_failures", plugin_output_dir, self.process_failures(), processing
)
elif plugin.state["algorithm"] == algorithms.ALARM:
alarms, logs = self.alarm(
plugin.state["alarm_ids"], plugin.state["entity_ids"]
)
alarm_output = os.path.join(output_directory, "alarm")
log_output = os.path.join(output_directory, "log")
os.makedirs(os.path.dirname(log_output), exist_ok=True)
alarm_output = os.path.join(plugin_output_dir, "alarm")
log_output = os.path.join(plugin_output_dir, "log")
# creating output alarm file
with open(alarm_output, "w") as file:
@ -186,8 +208,34 @@ class ExecutionEngine:
file.write(f"{k}\n")
for date in v["dates"]:
file.write(f" {date}\n")
logger.info("output at " + alarm_output)
logger.info("output at " + log_output)
if self.opts.verbose:
logger.info(processing + ", output at " + os.path.relpath(alarm_output) +
", " + os.path.relpath(log_output))
else:
logger.info(processing)
elif plugin.state["algorithm"] == algorithms.HEARTBEAT_LOSS:
self._create_output_file(
"heartbeat_loss", plugin_output_dir, self.heartbeat_loss(), processing
)
elif plugin.state["algorithm"] == algorithms.MAINTENANCE_ERR:
self._create_output_file(
"maintenance_errors", plugin_output_dir, self.maintenance_errors(), processing
)
elif plugin.state["algorithm"] == algorithms.DAEMON_FAILURES:
self._create_output_file(
"daemon_failures", plugin_output_dir, self.daemon_failures(), processing
)
elif plugin.state["algorithm"] == algorithms.STATE_CHANGES:
self._create_output_file(
"state_changes", plugin_output_dir, self.state_changes(), processing
)
if not self.opts.verbose:
logger.info("Output files for plugins can be found at " +
os.path.relpath(plugin_output_dir))
# Running the correlator and printing the output from it
self.run_correlator(output_directory, plugin_output_dir)
# Built-in algorithms ------------------------------
def alarm(self, alarm_ids=[], entity_ids=[]):
@ -299,32 +347,51 @@ class ExecutionEngine:
data = []
for file in files:
if not os.path.exists(file):
raise FileNotFoundError(f"File not found: {file}")
cont = True
# Searching through file
command = f"""grep -Ea "{'|'.join(s for s in substr)}" {file}"""
status = self._continue(file)
try:
if not os.path.exists(file):
if re.search("controller-1_(.+)/var/log/mtcAgent.log", file):
continue
raise FileNotFoundError(f"File not found: {file}")
cont = True
# Searching through file
command = f"""grep -Ea "{'|'.join(s for s in substr)}" {file} 2>/dev/null"""
status = self._continue(file)
if (
status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD
): # continue with current file
if status == CONTINUE_CURRENT:
cont = False
self._evaluate_substring(data, command)
# Searching through rotated log files
n = 1
while os.path.exists(f"{file}.{n}.gz") and cont:
command = f"""zgrep -E "{'|'.join(s for s in substr)}" {file}.{n}.gz"""
status = self._continue(f"{file}.{n}.gz", compressed=True)
if status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD:
if (
status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD
): # continue with current file
if status == CONTINUE_CURRENT:
cont = False
self._evaluate_substring(data, command)
n += 1
# Searching through rotated log files that have not been compressed
n = 1
while os.path.exists(f"{file}.{n}") and cont:
command = f"""grep -Ea "{'|'.join(s for s in substr)}" {file}.{n} 2>/dev/null"""
status = self._continue(f"{file}.{n}")
if status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD:
if status == CONTINUE_CURRENT:
cont = False
self._evaluate_substring(data, command)
n += 1
# Searching through rotated log files
while os.path.exists(f"{file}.{n}.gz") and cont:
command = f"""zgrep -E "{'|'.join(s for s in substr)}" {file}.{n}.gz 2>/dev/null"""
status = self._continue(f"{file}.{n}.gz", compressed=True)
if status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD:
if status == CONTINUE_CURRENT:
cont = False
self._evaluate_substring(data, command)
n += 1
except FileNotFoundError as e:
logger.error(e)
continue
return sorted(data)
@ -368,7 +435,7 @@ class ExecutionEngine:
return data
def swact(self):
def swact_activity(self):
"""Swact activity algorithm
Presents all swacting activity in the system
"""
@ -382,8 +449,12 @@ class ExecutionEngine:
for _, folder in self.hosts["controllers"].items():
sm_path = os.path.join(folder, "var", "log", "sm.log")
sm_files.append(sm_path)
sm_customer_path = os.path.join(folder, "var", "log", "sm-customer.log")
sm_customer_files.append(sm_customer_path)
sm_substrings = ["Swact has started,", "Swact update"]
sm_substrings = ["Uncontrolled swact", "Swact has started,",
"Neighbor (.+) is now in the down",
"Service (.+) has reached max failures", "Swact update"]
data = self.substring(sm_substrings, sm_files)
for i, line in enumerate(data):
@ -396,28 +467,25 @@ class ExecutionEngine:
line += f" SWACT TOOK {swact_end - swact_start} \n"
data[i] = line
for _, folder in self.hosts["controllers"].items():
sm_customer_path = os.path.join(folder, "var", "log", "sm-customer.log")
sm_customer_files.append(sm_customer_path)
sm_customer_substrings = ["swact"]
sm_customer_substrings = ["swact", "active-failed\s+\| disabling-failed\s+\|"]
data += self.substring(sm_customer_substrings, sm_customer_files)
return sorted(data)
def puppet(self):
"""Puppet error algorithm
def puppet_errors(self):
"""Puppet errors algorithm
Presents log errors from puppet logs
"""
data = []
for _, folder in self.hosts["controllers"].items():
puppet_folder = os.path.join(folder, "var", "log", "puppet")
command = f"grep -rh 'Error:' {puppet_folder}"
self._evaluate_substring(data, command)
for host_type in self.hosts.keys():
for _, folder in self.hosts[host_type].items():
puppet_folder = os.path.join(folder, "var", "log", "puppet")
command = f"""grep -rh "[m ]Error: " {puppet_folder} 2>/dev/null"""
self._evaluate_substring(data, command)
return sorted(data)
def process_failure(self):
"""Process failure algorithm
def process_failures(self):
"""Process failures algorithm
Presents log errors from pmond
"""
data = []
@ -426,9 +494,85 @@ class ExecutionEngine:
for _, folder in self.hosts[host_type].items():
pmond = os.path.join(folder, "var", "log", "pmond.log")
files.append(pmond)
data = self.substring(["Error :"], files)
return data
def heartbeat_loss(self):
"""Heartbeat loss algorithm
Presents all heartbeat loss error messages in the system
"""
data = []
hb_files = []
for _, folder in self.hosts["controllers"].items():
hb_path = os.path.join(folder, "var", "log", "hbsAgent.log")
hb_files.append(hb_path)
hb_substrings = ["Heartbeat Loss"]
data = self.substring(hb_substrings, hb_files)
return sorted(data)
def maintenance_errors(self):
"""Maintenance errors algorithm
Presents maintenance errors and other relevant log messages in the system
"""
data = []
mtc_files = []
for _, folder in self.hosts["controllers"].items():
agent = os.path.join(folder, "var", "log", "mtcAgent.log")
mtc_files.append(agent)
for host_type in self.hosts.keys():
for _, folder in self.hosts[host_type].items():
client = os.path.join(folder, "var", "log", "mtcClient.log")
mtc_files.append(client)
mtc_substrings = ["Error : ", "Configuration failure", "In-Test Failure",
"Loss Of Communication", "Graceful Recovery Wait ",
"regained MTCALIVE from host that has rebooted",
"Connectivity Recovered ; ", "auto recovery disabled",
"Graceful Recovery Failed"]
data = self.substring(mtc_substrings, mtc_files)
return sorted(data)
def daemon_failures(self):
"""Daemon failures algorithm
Presents all failed puppet manifest messages in the system
"""
data = []
daemon_files = []
for host_type in self.hosts.keys():
for _, folder in self.hosts[host_type].items():
daemon_path = os.path.join(folder, "var", "log", "daemon.log")
daemon_files.append(daemon_path)
daemon_substrings = ["Failed to run the puppet manifest"]
data = self.substring(daemon_substrings, daemon_files)
return sorted(data)
def state_changes(self):
"""State changes algorithm
Presents all messages in the system regarding the state of hosts
"""
data = []
sc_files = []
for _, folder in self.hosts["controllers"].items():
sc_path = os.path.join(folder, "var", "log", "mtcAgent.log")
sc_files.append(sc_path)
sc_substrings = ["is ENABLED", "allStateChange (.+)locked-disabled"]
data = self.substring(sc_substrings, sc_files)
return sorted(data)
def audit(self, start, end, audit_log_path):
"""Counts audit events in dcmanager within a specified date range
@ -480,6 +624,85 @@ class ExecutionEngine:
# -----------------------------------
def run_correlator(self, output_directory, plugin_output_dir):
"""Runs the correlator and prints the results differently based on if the tool was run with or
without the verbose option
Parameters:
output_directory (string) : directory to place output files from correlator
plugin_output_dir (string) : directory with output files from plugins
"""
correlator = Correlator(plugin_output_dir)
failures, events, state_changes = correlator.run(self.opts.hostname)
failures_len, events_len, state_changes_len = len(failures), len(events), len(state_changes)
failures.append("\nTotal failures found: " + str(failures_len) + "\n")
events.append("\nTotal events found: " + str(events_len) + "\n")
state_changes.append("\nTotal state changes found: " + str(state_changes_len) + "\n")
logger.info("\nRunning correlator...")
self._create_output_file("correlator_failures", output_directory, failures, "")
self._create_output_file("correlator_events", output_directory, events, "")
self._create_output_file("correlator_state_changes", output_directory, state_changes, "")
if not self.opts.verbose:
logger.info("Output can be found at " + os.path.relpath(output_directory) + "\n")
logger.info("Failures: " + str(failures_len))
for f in failures[:-1]:
if "Uncontrolled swact" in f:
logger.info(f[0:19] + " " + re.findall("active controller:? (.+)\n", f)[0] +
" uncontrolled swact")
elif "failure on" in f:
host = re.findall("failure on ([^\s]+) ", f)[0]
logger.info(f[0:19] + " " + host + " " +
re.findall("^(.+) failure on ", f[43:])[0].lower() + " failure")
else:
logger.info(f[:-1])
if failures_len != 0:
logger.info("\nEvents: " + str(events_len))
else:
logger.info("Events: " + str(events_len))
logger.info("State Changes: " + str(state_changes_len))
else:
logger.info("\nFailures: " + str(failures_len))
for f in failures[:-1]:
logger.info(f[:-1])
# Dictionary to keep track of number of times events happens on each host
events_summ = {}
for e in events[:-1]:
k = e[20:-1]
if "service failure" in k:
k = k.split(" (", 1)[0]
if not events_summ.get(k):
events_summ[k] = 1
else:
events_summ[k] += 1
if failures_len != 0:
logger.info("\nEvents: " + str(events_len))
else:
logger.info("Events: " + str(events_len))
for k, v in sorted(events_summ.items()):
logger.info(k + ": " + str(v) + " time(s)")
# Dictionary to keep track of number of times state changes happens on each host
state_changes_summ = {}
for s in state_changes[:-1]:
k = s[20:-1]
if "enabled" in k:
k = k.split("enabled", 1)[0] + "enabled"
if not state_changes_summ.get(k):
state_changes_summ[k] = 1
else:
state_changes_summ[k] += 1
if events_len != 0:
logger.info("\nState Changes: " + str(state_changes_len))
else:
logger.info("State Changes: " + str(state_changes_len))
for k, v in sorted(state_changes_summ.items()):
logger.info(k + ": " + str(v) + " time(s)")
def _continue(self, file, compressed=False):
CONTINUE_CURRENT = 0 # don't analyze older files, continue with current file
CONTINUE_CURRENT_OLD = 1 # analyze older files, continue with current file
@ -519,7 +742,7 @@ class ExecutionEngine:
datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
if date > self.opts.start and date < self.opts.end:
if line[0] == "|": # sm-customer.log edge case
line = line.replace("|", "").strip()
line = line[1:].strip()
line = re.sub("\s+", " ", line)
data.append(line)
break
@ -531,16 +754,26 @@ class ExecutionEngine:
GROUP_ONE = 1
with open(host_info_path) as file:
for line in file:
hostname_match = re.match("^hostname => (.+)", line)
subfunction_match = re.match("^subfunction => (.+)", line)
hostname_match = re.match(r"\s*hostname =>\s*\"?([^\"]*)(\n|\"\s*,?\s*\n)", line)
subfunction_match = re.match(r"\s*subfunction =>\s*\"?([^\"]*)(\n|\"\s*,?\s*\n)", line)
if subfunction_match:
subfunction = subfunction_match.group(GROUP_ONE)
if hostname_match:
hostname = hostname_match.group(GROUP_ONE)
return hostname, subfunction
def _create_output_file(self, filename, directory, events):
def _create_output_file(self, filename, directory, data, processing):
with open(os.path.join(directory, filename), "w") as file:
for i in events:
file.write(i + "\n")
logger.info("output at " + os.path.join(directory, filename))
for i in data:
if i[-1] == "\n":
file.write(i)
else:
file.write(i + "\n")
if self.opts.verbose:
output = "output at " + os.path.relpath(os.path.join(directory, filename))
if processing == "":
logger.info(output)
else:
logger.info(processing + ", " + output)
elif processing != "":
logger.info(processing)

View File

@ -141,17 +141,37 @@ class Plugin:
raise ValueError(
f"plugin: {plugin_name} should not have hosts to be specified"
)
elif self.state["algorithm"] == algorithms.SWACT:
elif self.state["algorithm"] == algorithms.SWACT_ACTIVITY:
if len(self.state["hosts"]) > 0:
raise ValueError(
f"plugin: {plugin_name} should not have hosts to be specified"
)
elif self.state["algorithm"] == algorithms.PUPPET:
elif self.state["algorithm"] == algorithms.PUPPET_ERRORS:
if len(self.state["hosts"]) > 0:
raise ValueError(
f"plugin: {plugin_name} should not have hosts to be specified"
)
elif self.state["algorithm"] == algorithms.PROCESS_FAILURE:
elif self.state["algorithm"] == algorithms.PROCESS_FAILURES:
if len(self.state["hosts"]) > 0:
raise ValueError(
f"plugin: {plugin_name} should not have hosts to be specified"
)
elif self.state["algorithm"] == algorithms.HEARTBEAT_LOSS:
if len(self.state["hosts"]) > 0:
raise ValueError(
f"plugin: {plugin_name} should not have hosts to be specified"
)
elif self.state["algorithm"] == algorithms.MAINTENANCE_ERR:
if len(self.state["hosts"]) > 0:
raise ValueError(
f"plugin: {plugin_name} should not have hosts to be specified"
)
elif self.state["algorithm"] == algorithms.DAEMON_FAILURES:
if len(self.state["hosts"]) > 0:
raise ValueError(
f"plugin: {plugin_name} should not have hosts to be specified"
)
elif self.state["algorithm"] == algorithms.STATE_CHANGES:
if len(self.state["hosts"]) > 0:
raise ValueError(
f"plugin: {plugin_name} should not have hosts to be specified"

View File

@ -0,0 +1,3 @@
algorithm=alarm
alarm_ids=400., 401.
entity_ids=host=controller-0, host=controller-1

View File

@ -0,0 +1 @@
algorithm=daemon_failures

View File

@ -0,0 +1 @@
algorithm=heartbeat_loss

View File

@ -0,0 +1 @@
algorithm=maintenance_errors

View File

@ -0,0 +1 @@
algorithm=process_failures

View File

@ -0,0 +1 @@
algorithm=puppet_errors

View File

@ -0,0 +1 @@
algorithm=state_changes

View File

@ -0,0 +1,5 @@
algorithm=substring
files=var/log/mtcAgent.log, var/log/sm.log
hosts=controllers
substring=operation failed
substring=Failed to send message

View File

@ -0,0 +1 @@
algorithm=swact_activity

View File

@ -0,0 +1 @@
algorithm=system_info

View File

@ -17,9 +17,9 @@
# The report tool requires the collect bundle and host tarballs to be
# untarred.
#
# The report tool reads user plugins from a plugins directory in the
# top level of the collect bundle, and outputs files containing
# relevant logs to a report directory in the top level as well.
# The report tool reads user plugins from the report directory in the
# top level of the collect bundle, and outputs files containing
# relevant logs to this directory as well.
#
# Typical Usage:
# command line functionality
@ -39,24 +39,35 @@
import argparse
from cmath import log
from datetime import datetime
from datetime import timedelta
from datetime import timezone
import logging
import os
import time
import subprocess
import sys
from execution_engine import ExecutionEngine
from plugin import Plugin
now = datetime.now(timezone.utc)
base_dir = os.path.realpath(__file__)
default_path = os.path.join(os.path.dirname(base_dir), "..", "..")
base_dir = os.path.dirname(os.path.realpath(__file__))
parent_dir = os.path.dirname(base_dir)
default_path = os.path.dirname(parent_dir)
plugins = []
parser = argparse.ArgumentParser(
description="Log Event Reporter",
epilog="Place plugins in 'plugins' directory at top level of collect bundle. Output files will be placed in 'report' directory."
"\nThis tool will create a report.log file along with other output files",
epilog="Place plugins in 'plugins' directory found in 'report' directory at top level of collect bundle. Output files"
"\nwill be placed in 'report' directory."
"\nThis tool will create a report.log and untar.log file along with other output files.",
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
help="Verbose output",
)
parser.add_argument(
"-s",
@ -67,7 +78,7 @@ parser.add_argument(
parser.add_argument(
"-e",
"--end",
default=datetime.strftime(now, "%Y%m%d"),
default=datetime.strftime(now + timedelta(days=1), "%Y%m%d"),
help="Specify an end date in YYYYMMDD format for analysis (default: current date)",
)
parser.add_argument(
@ -81,7 +92,12 @@ parser.add_argument(
"-d",
"--directory",
default=default_path,
help="Specify top level of collect bundle to analyze (default: two levels above current location)",
help="Specify top level of collect bundle to analyze (default: two levels above tool directory)",
)
parser.add_argument(
"--hostname",
default="all",
help="Specify host for correlator to find significant events and state changes for (default: all hosts)",
)
subparsers = parser.add_subparsers(help="algorithms", dest="algorithm")
@ -93,10 +109,10 @@ parser_substring = subparsers.add_parser(
There will be an output file for each host of the host type specified.""",
epilog="Plugin file example:\n"
" algorithm=substring\n"
" files=mtcAgent.log, sm.log\n"
" hosts=controllers, workers\n"
" substring=Swact in progress\n"
" substring=Swact update",
" files=var/log/mtcAgent.log, var/log/sm.log\n"
" hosts=controllers\n"
" substring=operation failed\n"
" substring=Failed to send message",
)
substring_required = parser_substring.add_argument_group("required arguments")
substring_required.add_argument(
@ -124,8 +140,8 @@ parser_alarm = subparsers.add_parser(
help="Searches through fm.db.sql.txt for alarms and logs. There are 2 output files: 'alarm', and 'log'",
epilog="Plugin file example:\n"
" algorithm=alarm\n"
" alarm_ids=400.005,200.004\n"
" entity_ids= host=controller-0,host=controller-1\n",
" alarm_ids=400.005, 200.004\n"
" entity_ids=host=controller-0, host=controller-1\n",
)
parser_alarm.add_argument(
"--alarm_ids",
@ -151,27 +167,59 @@ parser_system_info = subparsers.add_parser(
)
# swact activity algorithm
parser_swact = subparsers.add_parser(
"swact",
parser_swact_activity = subparsers.add_parser(
"swact_activity",
formatter_class=argparse.RawTextHelpFormatter,
help="Presents system swacting activity",
epilog="Plugin file example:\n" " algorithm=swact\n",
epilog="Plugin file example:\n" " algorithm=swact_activity\n",
)
# puppet errors algorithm
parser_puppet = subparsers.add_parser(
"puppet",
parser_puppet_errors = subparsers.add_parser(
"puppet_errors",
formatter_class=argparse.RawTextHelpFormatter,
help="Presents any puppet errors",
epilog="Plugin file example:\n" " algorithm=puppet\n",
epilog="Plugin file example:\n" " algorithm=puppet_errors\n",
)
# process failure algorithm
parser_process_failure = subparsers.add_parser(
"process_failure",
# process failures algorithm
parser_process_failures = subparsers.add_parser(
"process_failures",
formatter_class=argparse.RawTextHelpFormatter,
help="Presents any process failures from pmond.log",
epilog="Plugin file example:\n" " algorithm=process_failure\n",
epilog="Plugin file example:\n" " algorithm=process_failures\n",
)
# daemon failures algorithm
parser_daemon_failures = subparsers.add_parser(
"daemon_failures",
formatter_class=argparse.RawTextHelpFormatter,
help="Presents any puppet manifest failures from daemon.log",
epilog="Plugin file example:\n" " algorithm=daemon_failures\n",
)
# heartbeat loss algorithm
parser_heartbeat_loss = subparsers.add_parser(
"heartbeat_loss",
formatter_class=argparse.RawTextHelpFormatter,
help="Presents any heartbeat loss error messages from hbsAgent.log",
epilog="Plugin file example:\n" " algorithm=heartbeat_loss\n",
)
# maintenance errors algorithm
parser_maintenance_errors = subparsers.add_parser(
"maintenance_errors",
formatter_class=argparse.RawTextHelpFormatter,
help="Presents errors and other relevant messages from mtcAgent.log and mtcClient.log",
epilog="Plugin file example:\n" " algorithm=maintenance_errors\n",
)
# state changes algorithm
parser_state_changes = subparsers.add_parser(
"state_changes",
formatter_class=argparse.RawTextHelpFormatter,
help="Presents any messages from mtcAgent.log regarding the state of hosts, such as enabled/disabled",
epilog="Plugin file example:\n" " algorithm=state_changes\n",
)
# audit algorithm
@ -185,11 +233,19 @@ parser_audit = subparsers.add_parser(
" start=2022-06-01 10:00:00\n"
" end=2022-06-01 04:00:00\n",
)
parser_audit_required = parser_audit.add_argument_group("required arguments")
parser_audit_required.add_argument("--start", required=True)
parser_audit_required.add_argument(
parser_audit.add_argument(
"--start",
required=False,
default=datetime.strftime(now - timedelta(days = 7), "%Y-%m-%d %H:%M:%S"),
type=str,
help="Specify a start date in YYYY-MM-DD HH:MM:SS format for analysis (not required, default: 1 week ago)"
)
parser_audit.add_argument(
"--end",
required=True,
required=False,
default=datetime.strftime(now, "%Y-%m-%d %H:%M:%S"),
type=str,
help="Specify an end date in YYYY-MM-DD HH:MM:SS format for analysis (not required, default: today)"
)
@ -197,12 +253,17 @@ args = parser.parse_args()
args.start = datetime.strptime(args.start, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S")
args.end = datetime.strptime(args.end, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S")
output_directory = os.path.join(
args.directory, "report", "output", now.strftime("%Y%m%d.%H%M%S")
)
if args.directory.endswith("/"):
output_directory = os.path.join(
default_path, "report", "output", os.path.basename(os.path.dirname(args.directory))
)
else:
output_directory = os.path.join(
default_path, "report", "output", os.path.basename(args.directory)
)
# creating report log
os.makedirs(output_directory)
os.makedirs(output_directory, exist_ok=True)
open(os.path.join(output_directory, "report.log"), "w").close()
# setting up logger
@ -223,17 +284,38 @@ ch.setFormatter(formatter)
logger.addHandler(ch)
if not os.path.isdir(args.directory):
sys.exit("Top level of collect bundle given to analyze is not a directory")
else:
for obj in (os.scandir(args.directory)):
info = os.path.splitext(obj.name)
# TODO: ask user which file to report on if more than one tarball in directory
# Check if collect tarball is in given directory and extracts it if not already done
if (obj.is_file() and info[1] == ".tar"):
try:
result = subprocess.check_output(["tar", "tf", obj.path], encoding="UTF-8")
result = result.split("\n", 1)
if not os.path.isdir(os.path.join(args.directory, os.path.dirname(result[0]))):
subprocess.run(["tar", "xfC", obj.path, args.directory], check=True)
subprocess.run(["echo","extracted", obj.name], check=True)
args.directory = os.path.join(args.directory, os.path.dirname(result[0]))
break
except subprocess.CalledProcessError as e:
logger.error(e)
try:
engine = ExecutionEngine(args)
engine = ExecutionEngine(args, output_directory)
except ValueError as e:
logger.error(str(e))
sys.exit("Confirm you are running the report tool on a collect bundle")
if args.algorithm:
plugins.append(Plugin(opts=vars(args)))
else:
if args.plugin:
for p in args.plugin:
path = os.path.join(args.directory, "plugins", p)
path = os.path.join(default_path, "report", "plugins", p)
if os.path.exists(path):
try:
plugins.append(Plugin(path))
@ -243,7 +325,7 @@ else:
else:
logger.warning(f"{p} plugin does not exist")
else:
path = os.path.join(args.directory, "plugins")
path = os.path.join(default_path, "report", "plugins")
if not os.path.exists(path):
os.mkdir(path)
logger.error("Plugins folder is empty")