Report Tool: Package and add plugins/correlator
This update packages the report tool and plugin files into Debian, and bundles it with the collect tool so that they are added to the 'collect' tarballs at the time of creation. The report tool now allows users to point it at any collect bundle and have it automatically extract the tarball and tar files for each host before running. This update also adds heartbeat loss, maintenance errors, daemon failures, and state changes plugin algorithms to the report tool. Some of the existing algorithms were enhanced to extract more relevant log events. Lastly, there is a correlator function implemented into the tool that determines failures in collect bundles and their root causes, as well as finds significant events and state changes from the log files. They are presented in output files and summaries are printed out onto the command line. Users can also specify if they want the correlator to only find events and state changes for a specific host. Test Plan: PASS: Verify tool is packaged in Debian PASS: Verify tool is inserted into 'collect' tarballs PASS: Verify tool extracts tarballs and host tarfiles PASS: Verify tool can point at any collect bundle and run successfully PASS: Verify substring plugin algorithm is working PASS: Verify swact activity plugin algorithm is working PASS: Verify heartbeat loss plugin algorithm is working PASS: Verify maintenance errors plugin algorithm is working PASS: Verify daemon failures plugin algorithm is working PASS: Verify state changes plugin algorithm is working PASS: Verify failures and correct root causes are found by correlator PASS: Verify significant events are found by correlator PASS: Verify state changes are found by correlator PASS: Verify failures, events and state changes are printed into files PASS: Verify tool prints correct info onto command line PASS: Verify correlator only finds events for specified host PASS: Verify correlator only finds state changes for specified host Story: 2010166 Task: 46177 Signed-off-by: Angela 
Mao <Angela.Mao@windriver.com> Change-Id: I02e28edf16b342abf2224cc98325d77ba0678055
This commit is contained in:
parent
5a653a9e4b
commit
fc6cb0a607
|
@ -2941,6 +2941,56 @@ collect_subclouds()
|
|||
fi
|
||||
}
|
||||
|
||||
############################################################################
|
||||
#
|
||||
# Name : get_report_tool
|
||||
#
|
||||
# Purpose : Fetch report tool from current host
|
||||
#
|
||||
# Parameters: $1 - local path destination
|
||||
#
|
||||
############################################################################
|
||||
|
||||
function get_report_tool()
{
    # Copy the report tool from this host into the collect staging area.
    #
    # $1 - local path destination ; created if it does not already exist
    #
    # Calls report_error on copy failure, otherwise logs success with ilog.
    local local_dest="${1}"

    mkdir -p "${local_dest}"

    # Quote the destination so paths with spaces survive word splitting,
    # and capture the copy status immediately so nothing can clobber ${?}.
    cp -r /usr/local/bin/report/tool "${local_dest}"
    local rc=${?}
    if [ ${rc} -ne ${PASS} ] ; then
        report_error "failed to get report tool from /usr/local/bin" ${rc}
    else
        ilog "copied report tool from host"
    fi
}
|
||||
|
||||
############################################################################
|
||||
#
|
||||
# Name : get_report_plugins
|
||||
#
|
||||
# Purpose : Fetch plugins for report tool from current host
|
||||
#
|
||||
# Parameters: $1 - local path destination
|
||||
#
|
||||
############################################################################
|
||||
|
||||
function get_report_plugins()
{
    # Copy the report tool plugin files from this host into the collect
    # staging area.
    #
    # $1 - local path destination ; created if it does not already exist
    #
    # Calls report_error on copy failure, otherwise logs success with ilog.
    local local_dest="${1}"

    mkdir -p "${local_dest}"

    # Quote the destination so paths with spaces survive word splitting,
    # and capture the copy status immediately so nothing can clobber ${?}.
    cp -r /etc/collect/plugins "${local_dest}"
    local rc=${?}
    if [ ${rc} -ne ${PASS} ] ; then
        report_error "failed to get report plugins from /etc/collect" ${rc}
    else
        ilog "copied plugins for report tool from host"
    fi
}
|
||||
|
||||
############################################################################
|
||||
#
|
||||
# Handle subcloud and system hosts batched collect
|
||||
|
@ -3031,6 +3081,17 @@ echo -n "creating ${COLLECT_TYPE} tarball ${TARBALL_NAME} ... "
|
|||
|
||||
remove_file_local ${COLLECT_ERROR_LOG}
|
||||
remove_file_local ${HOST_COLLECT_ERROR_LOG}
|
||||
get_report_tool ${COLLECT_DIR}/report
|
||||
get_report_plugins ${COLLECT_DIR}/report
|
||||
|
||||
cd ${COLLECT_DIR}
|
||||
tar -czf report_tool.tgz report
|
||||
rc=${?}
|
||||
if [ ${rc} -ne ${PASS} ] ; then
|
||||
report_error "failed to tar report tool" ${rc}
|
||||
else
|
||||
rm -r report
|
||||
fi
|
||||
|
||||
/usr/bin/expect << EOF
|
||||
log_user ${USER_LOG_MODE}
|
||||
|
|
|
@ -3,3 +3,4 @@ etc/collect.d/* /etc/collect.d
|
|||
usr/local/sbin/* /usr/local/sbin
|
||||
usr/local/bin/collect /usr/local/bin
|
||||
usr/sbin/collect /usr/sbin
|
||||
/usr/local/bin/report/* /usr/local/bin/report
|
||||
|
|
|
@ -13,8 +13,10 @@ override_dh_auto_install:
|
|||
|
||||
install -m 755 -d $(SYSCONFDIR)/collect.d
|
||||
install -m 755 -d $(SYSCONFDIR)/collect
|
||||
install -m 755 -d $(SYSCONFDIR)/collect/plugins # Report Tool
|
||||
install -m 755 -d $(ROOT)/usr/local/sbin
|
||||
install -m 755 -d $(ROOT)/usr/local/bin
|
||||
install -m 755 -d $(ROOT)/usr/local/bin/report/tool # Report Tool
|
||||
install -m 755 -d $(SBINDIR)
|
||||
|
||||
install -m 755 -p collect $(ROOT)/usr/local/sbin/collect
|
||||
|
@ -26,6 +28,24 @@ override_dh_auto_install:
|
|||
install -m 755 -p expect_done $(ROOT)/usr/local/sbin/expect_done
|
||||
install -m 755 -p mariadb-cli.sh $(ROOT)/usr/local/sbin/mariadb-cli
|
||||
|
||||
# Report Tool
|
||||
install -m 755 -p report/report.py $(ROOT)/usr/local/bin/report/tool/report.py
|
||||
install -m 755 -p report/execution_engine.py $(ROOT)/usr/local/bin/report/tool/execution_engine.py
|
||||
install -m 755 -p report/algorithms.py $(ROOT)/usr/local/bin/report/tool/algorithms.py
|
||||
install -m 755 -p report/plugin.py $(ROOT)/usr/local/bin/report/tool/plugin.py
|
||||
install -m 755 -p report/correlator.py $(ROOT)/usr/local/bin/report/tool/correlator.py
|
||||
install -m 755 -p report/README $(ROOT)/usr/local/bin/report/tool/README
|
||||
install -m 755 -p report/plugins/alarm $(SYSCONFDIR)/collect/plugins/alarm
|
||||
install -m 755 -p report/plugins/daemon_failures $(SYSCONFDIR)/collect/plugins/daemon_failures
|
||||
install -m 755 -p report/plugins/heartbeat_loss $(SYSCONFDIR)/collect/plugins/heartbeat_loss
|
||||
install -m 755 -p report/plugins/maintenance_errors $(SYSCONFDIR)/collect/plugins/maintenance_errors
|
||||
install -m 755 -p report/plugins/process_failures $(SYSCONFDIR)/collect/plugins/process_failures
|
||||
install -m 755 -p report/plugins/puppet_errors $(SYSCONFDIR)/collect/plugins/puppet_errors
|
||||
install -m 755 -p report/plugins/state_changes $(SYSCONFDIR)/collect/plugins/state_changes
|
||||
install -m 755 -p report/plugins/substring $(SYSCONFDIR)/collect/plugins/substring
|
||||
install -m 755 -p report/plugins/swact_activity $(SYSCONFDIR)/collect/plugins/swact_activity
|
||||
install -m 755 -p report/plugins/system_info $(SYSCONFDIR)/collect/plugins/system_info
|
||||
|
||||
install -m 755 -p collect_sysinv.sh $(SYSCONFDIR)/collect.d/collect_sysinv
|
||||
install -m 755 -p collect_psqldb.sh $(SYSCONFDIR)/collect.d/collect_psqldb
|
||||
install -m 755 -p collect_mariadb.sh $(SYSCONFDIR)/collect.d/collect_mariadb
|
||||
|
|
|
@ -13,26 +13,28 @@ SELECT_NODES_20220527.193605
|
|||
│ ├── etc
|
||||
│ ├── root
|
||||
│ └── var
|
||||
├── plugins (where the plugin files will be placed)
|
||||
│ ├── alarm_plugin_example
|
||||
│ └── substring_plugin_example
|
||||
├── report
|
||||
└── tool (where the tool will be placed)
|
||||
├── plugins (where the plugin files will be placed)
|
||||
│ ├── alarm
|
||||
│ ├── substring
|
||||
│ └── ...
|
||||
├── tool (where the tool will be placed)
|
||||
└── output (where the output files will be placed)
|
||||
|
||||
|
||||
> cat plugins/alarm_plugin_example
|
||||
> cat plugins/alarm
|
||||
|
||||
algorithm=alarm
|
||||
alarm_ids=400.,401.
|
||||
entity_ids = host=controller-0
|
||||
alarm_ids=400., 401.
|
||||
entity_ids=host=controller-0, host=controller-1
|
||||
|
||||
> cat plugins/substring_plugin_example
|
||||
> cat plugins/substring
|
||||
|
||||
algorithm=substring
|
||||
files=var/log/mtcAgent.log
|
||||
files=var/log/mtcAgent.log, var/log/sm.log
|
||||
hosts=controllers
|
||||
substring=operation failed
|
||||
substring=Failed to send message
|
||||
|
||||
> report/tool/report.py --start 20220501 --end 20220530
|
||||
|
||||
|
@ -41,7 +43,8 @@ The tool also provides default values, more details are in 'report.py -h'.
|
|||
|
||||
The substring algorithm creates an output file for every host of the
|
||||
specified host type. The files will contain log events within the
|
||||
provided date range containing the substring 'operation failed'.
|
||||
provided date range containing the substring 'operation failed' and 'Failed
|
||||
to send message'.
|
||||
|
||||
The alarm algorithm creates two output file: 'log' and 'alarm'
|
||||
'log' contains customer log messages created within the provided date range,
|
||||
|
@ -53,10 +56,14 @@ Here is the report directory after running the above command
|
|||
|
||||
report
|
||||
├── output
|
||||
│ └── 20220815.140008 (time in utc when tool was ran)
|
||||
│ ├── alarm
|
||||
│ ├── controller-0_substring_plugin_example_substring
|
||||
│ ├── controller-1_substring_plugin_example_substring
|
||||
│ ├── report.log (log file for report tool)
|
||||
│ └── log
|
||||
└── tool (where the report tool is)
|
||||
│ └── SELECT_NODES_20220527.193605 (collect bundle that the report tool was run on)
|
||||
│ ├── plugins (output files for plugins)
|
||||
│ │ ├── alarm
|
||||
│ │ └── ...
|
||||
│ ├── correlator_failures
|
||||
│ ├── correlator_events
|
||||
│ ├── correlator_state_changes
|
||||
│ ├── report.log (log file for report tool)
|
||||
│ └── untar.log (log file for untarring collect bundle and host tar files)
|
||||
├── plugins (where the plugins files are)
|
||||
└── tool (where the report tool is)
|
||||
|
|
|
@ -9,8 +9,12 @@
|
|||
# Algorithm string constants
|
||||
ALARM = "alarm"
|
||||
AUDIT = "audit"
|
||||
PROCESS_FAILURE = "process_failure"
|
||||
PUPPET = "puppet"
|
||||
DAEMON_FAILURES = "daemon_failures"
|
||||
HEARTBEAT_LOSS = "heartbeat_loss"
|
||||
MAINTENANCE_ERR = "maintenance_errors"
|
||||
PROCESS_FAILURES = "process_failures"
|
||||
PUPPET_ERRORS = "puppet_errors"
|
||||
STATE_CHANGES = "state_changes"
|
||||
SUBSTRING = "substring"
|
||||
SWACT = "swact"
|
||||
SWACT_ACTIVITY = "swact_activity"
|
||||
SYSTEM_INFO = "system_info"
|
||||
|
|
|
@ -0,0 +1,504 @@
|
|||
########################################################################
|
||||
#
|
||||
# Copyright (c) 2022 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
########################################################################
|
||||
#
|
||||
# This file contains the Correlator class.
|
||||
# The Correlator class contains algorithms that search for failures.
|
||||
#
|
||||
# The Correlator class reads through all the output files created by
|
||||
# the plugins and detects failures. A summary of the failures and their
|
||||
# causes are printed to standard output and an output file is created
|
||||
# in the report directory.
|
||||
#
|
||||
########################################################################
|
||||
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Correlator:
    """Correlates plugin output files to locate failures and their causes.

    Reads the files written by the report-tool plugins, detects failures,
    significant events and state changes, and returns them as summaries.
    """

    def __init__(self, plugin_output_dir):
        """Initialize the correlator.

        Parameters:
            plugin_output_dir (string): Path to directory with output files
                from plugins
        """
        # All search methods open their input files relative to this path.
        self.plugin_output_dir = plugin_output_dir
||||
def run(self, hostname):
|
||||
"""Searches through the output files created by the plugins for
|
||||
failures and determines their causes, as well as extracts significant
|
||||
events and state changes
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
failures = []
|
||||
try:
|
||||
failures += self.uncontrolled_swact()
|
||||
except FileNotFoundError as e:
|
||||
logger.error(e)
|
||||
|
||||
try:
|
||||
failures += self.mtc_errors()
|
||||
except FileNotFoundError as e:
|
||||
logger.error(e)
|
||||
|
||||
events = []
|
||||
try:
|
||||
events += self.get_events(hostname)
|
||||
except FileNotFoundError as e:
|
||||
logger.error(e)
|
||||
|
||||
state_changes = []
|
||||
try:
|
||||
state_changes += self.get_state_changes(hostname)
|
||||
except FileNotFoundError as e:
|
||||
logger.error(e)
|
||||
|
||||
return sorted(failures), sorted(events), sorted(state_changes)
|
||||
|
||||
    def uncontrolled_swact(self):
        """Searches through the output file created by the swact activity
        plugin for uncontrolled swacts and determines their causes through
        other indicators

        Errors:
            FileNotFoundError
        """
        data = []

        # Variables to keep track of indicators for failure causes
        start_time = end_time = svc_failed = None
        ctrlr_down = None  # Active controller that went down, causing swact
        ctrlr_svc_fail = None  # Active controller where service failed
        ctrlr_link_down = None  # Orig. active controller when link went down
        hb_loss = active_failed = go_active_failed = link_down = False

        # Open output file from swact activity plugin and read it
        file_path = os.path.join(self.plugin_output_dir, "swact_activity")

        with open(file_path, "r") as swact_activity:
            for line in swact_activity:
                # A swact window opens at the first "Uncontrolled swact"
                # line; line[0:19] is assumed to hold the ISO timestamp.
                if "Uncontrolled swact" in line and not start_time:
                    start_time = datetime.strptime(line[0:19],
                                                   "%Y-%m-%dT%H:%M:%S")
                if ("Host from active to failed, Peer from standby to "
                        "active" in line):
                    # Link-down indicator; remember the originally-active
                    # controller named in the sm log line.
                    link_down = True
                    ctrlr_link_down = re.findall(
                        r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) "
                        "sm:", line)[0]
                elif (re.search("Neighbor (.+) is now in the down", line)
                        and start_time and not ctrlr_down):
                    # Controller-down indicator inside an open swact window
                    ctrlr_down = re.findall(
                        r"Neighbor \((.+)\) received event", line)[0]
                elif (re.search("Service (.+) is failed and has reached max "
                                "failures", line) and not svc_failed):
                    # Service-failure indicator: record the failed service
                    # and the controller it failed on.
                    svc_failed = re.findall(
                        r"Service \((.+)\) is failed", line)[0]
                    ctrlr_svc_fail = re.findall(
                        r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:",
                        line)[0]
                elif (svc_failed and re.search(
                        "active-failed\\s+\\| disabling-failed\\s+\\| "
                        + svc_failed, line)):
                    # Distinguish a failed go-active from a plain active
                    # failure for the recorded service.
                    if re.search(r"\| go-active-failed\s+\|", line):
                        go_active_failed = True
                    else:
                        active_failed = True
                elif "Swact update" in line and start_time and not end_time:
                    # Swact window closes here; decide the cause from the
                    # indicators gathered above, then reset for the next one.
                    end_time = datetime.strptime(line[0:19],
                                                 "%Y-%m-%dT%H:%M:%S")
                    if ctrlr_down:
                        try:
                            # Heartbeat loss near the window strengthens the
                            # "controller reset" explanation.
                            hb_loss = self.search_hb_loss(
                                start_time, end_time, ctrlr_down)
                        except FileNotFoundError as e:
                            logger.error(e)

                    start_time = start_time.strftime("%Y-%m-%dT%H:%M:%S")
                    end_time = end_time.strftime("%Y-%m-%dT%H:%M:%S")
                    if link_down:
                        data.append(start_time + " to " + end_time
                                    + " Uncontrolled swact, refer to SM logs "
                                    "for in-depth analysis, original active "
                                    "controller: " + ctrlr_link_down + "\n")
                    elif ctrlr_down:
                        if hb_loss:
                            data.append(start_time + " to " + end_time
                                        + " Uncontrolled swact due to "
                                        "spontaneous reset of active "
                                        "controller " + ctrlr_down + "\n")
                        else:
                            data.append(start_time + " to " + end_time
                                        + " Uncontrolled swact likely due to "
                                        "spontaneous reset of active "
                                        "controller " + ctrlr_down + "\n")
                    elif svc_failed:
                        if active_failed and go_active_failed:
                            data.append(start_time + " to " + end_time
                                        + " Uncontrolled swact due to service "
                                        "failure (" + svc_failed + ") twice "
                                        "in 2 minutes was unsuccessful so "
                                        "\"bounced back\" to original active "
                                        "controller " + ctrlr_svc_fail + "\n")
                        elif active_failed:
                            data.append(start_time + " to " + end_time
                                        + " Uncontrolled swact due to service "
                                        "failure (" + svc_failed + ") twice "
                                        "in 2 minutes on active controller "
                                        + ctrlr_svc_fail + "\n")
                        else:
                            data.append(start_time + " to " + end_time
                                        + " Uncontrolled swact likely due to "
                                        "service failure (" + svc_failed
                                        + ") twice in 2 minutes on active "
                                        "controller " + ctrlr_svc_fail + "\n")

                    # Reset all indicators for the next swact window
                    start_time = end_time = svc_failed = None
                    ctrlr_down = ctrlr_svc_fail = ctrlr_link_down = None
                    hb_loss = active_failed = go_active_failed = False
                    link_down = False

        return data
|
||||
    def mtc_errors(self):
        """Searches through the output file created by the maintenance errors
        plugin for failures and determines their causes through other
        indicators

        Errors:
            FileNotFoundError
        """
        data = []

        # Variables to keep track of indicators for failure causes
        goenable_start = goenable_end = goenable_host = None
        goenable_tst_f = config_tst_f = None  # Tests failed
        config_start = config_end = config_host = puppet_error = None
        hb_loss_start = hb_loss_end = hb_loss_host = None
        daemon_fail = comm_loss = auto_recov_dis = False

        # Open output file from maintenance errors plugin and read it
        file_path = os.path.join(self.plugin_output_dir, "maintenance_errors")

        with open(file_path, "r") as mtc:
            for line in mtc:
                if "auto recovery disabled" in line and not auto_recov_dis:
                    # Check if previous failure recorded was go-enable,
                    # configuration or heartbeat failure
                    if (data and
                            re.search(r"Go-enable|[cC]onfiguration|Heartbeat",
                                      data[-1])):
                        host = re.findall(r"failure on ([^\s]+)", data[-1])
                        # Check if host in auto recovery disabled mode is same
                        # as host with previous failure
                        if (host and re.search(
                                host[0] + " auto recovery disabled", line)):
                            # Annotate the previously recorded failure
                            # rather than adding a new entry
                            old = data[-1].split("due", 1)
                            if len(old) == 1:
                                data[-1] = (data[-1][:-1]
                                            + " (auto recovery disabled)\n")
                            else:
                                data[-1] = (old[0]
                                            + "(auto recovery disabled) due"
                                            + old[1])
                            auto_recov_dis = True
                elif "GOENABLED Failed" in line and not goenable_start:
                    goenable_start, auto_recov_dis = line[0:19], False
                    goenable_host = re.findall(
                        "Error : (.+) got GOENABLED Failed", line)[0]
                elif ("configuration failed or incomplete" in line
                        and not config_start):
                    config_start = datetime.strptime(line[0:19],
                                                     "%Y-%m-%dT%H:%M:%S")
                    auto_recov_dis = False
                    config_host = re.findall(
                        "Error : (.+) configuration failed", line)[0]
                elif "Heartbeat Loss" in line:
                    # Check if previous failure recorded was heartbeat loss
                    # due to missing heartbeat messages
                    if ("(during recovery soak)" in line and data and
                            re.search("missing heartbeat messages", data[-1])):
                        host = re.findall(
                            "failure on (.+) due to", data[-1])[0]
                        # Check if host with heartbeat loss failure is the same
                        # as host with previous failure
                        if (re.search(host + " (.+) Heartbeat Loss (.+) "
                                      "\\(during recovery soak\\)", line)):
                            # Extend the prior entry's end time to this line
                            old = data[-1]
                            data[-1] = (old[0:23] + line[0:19] + old[42:-1]
                                        + " (recovery over disabled due to "
                                        "heartbeat soak failure)\n")
                    else:
                        # Start tracking a fresh heartbeat loss failure
                        hb_loss_start = line[0:19]
                        comm_loss = auto_recov_dis = False
                        hb_loss_host = re.findall("Error : (.+) [CM]", line)[0]
                # Check if previous failure recorded was heartbeat loss due to
                # missing heartbeat messages
                elif ("regained MTCALIVE from host that has rebooted" in line
                        and data and re.search(r"Heartbeat loss failure (.+) "
                                               r"\(recovery over disabled\)",
                                               data[-1])):
                    host = re.findall("failure on (.+) due to", data[-1])[0]
                    if re.search(host + " regained MTCALIVE", line):
                        # Reboot confirmed: rewrite the cause of the last entry
                        old = data[-1].split("due", 1)[0]
                        data[-1] = (old[0:23] + line[0:19] + old[42:]
                                    + "due to uncontrolled reboot\n")
                elif (hb_loss_start and not comm_loss and hb_loss_host and
                        re.search(hb_loss_host + " Loss Of Communication for 5 "
                                  "seconds", line)):
                    comm_loss = True
                elif re.search("mtcClient --- (.+)Error : FAILED:", line):
                    # A failed mtcClient test belongs to whichever failure
                    # window (go-enable or configuration) is currently open
                    if goenable_start and not goenable_tst_f:
                        goenable_tst_f = re.findall(
                            r"Error : FAILED: (.+) \(\d", line)[0]
                    elif config_start and not config_tst_f:
                        config_tst_f = re.findall(
                            r"Error : FAILED: (.+) \(\d", line)[0]
                elif (goenable_host and not goenable_end and
                        re.search(goenable_host + " Task: In-Test Failure, "
                                  "threshold reached", line)):
                    # Go-enable failure window closes here
                    goenable_end = line[0:19]
                    if goenable_tst_f:
                        data.append(goenable_start + " to " + goenable_end
                                    + " Go-enable test failure on "
                                    + goenable_host + " due to failing of "
                                    + goenable_tst_f + "\n")
                    else:
                        data.append(goenable_start + " to " + goenable_end
                                    + " Go-enable test failure on "
                                    + goenable_host + " due to unknown test "
                                    "failing\n")

                    goenable_start = goenable_end = goenable_host = None
                    goenable_tst_f = None
                elif (config_host and not config_end and
                        re.search(config_host + " Task: Configuration failure, "
                                  "threshold reached", line)):
                    # Configuration failure window closes here
                    config_end = datetime.strptime(line[0:19],
                                                   "%Y-%m-%dT%H:%M:%S")
                    if (config_tst_f
                            != "/etc/goenabled.d/config_goenabled_check.sh"):
                        try:
                            daemon_fail = self.search_daemon_fail(
                                config_start, config_end, config_host)
                        except FileNotFoundError as e:
                            logger.error(e)

                    if (config_tst_f ==
                            "/etc/goenabled.d/config_goenabled_check.sh"
                            or daemon_fail):
                        # Look for the specific puppet error behind the
                        # configuration failure
                        try:
                            puppet_error = self.search_puppet_error(
                                config_start, config_end)
                        except FileNotFoundError as e:
                            logger.error(e)

                        config_start = config_start.strftime(
                            "%Y-%m-%dT%H:%M:%S")
                        config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S")
                        if puppet_error:
                            data.append(config_start + " to " + config_end
                                        + " Configuration failure on "
                                        + config_host + " due to:\n"
                                        + puppet_error)
                        else:
                            data.append(config_start + " to " + config_end
                                        + " Configuration failure on "
                                        + config_host
                                        + " due to unknown cause\n")
                    else:
                        config_start = config_start.strftime(
                            "%Y-%m-%dT%H:%M:%S")
                        config_end = config_end.strftime("%Y-%m-%dT%H:%M:%S")
                        data.append(config_start + " to " + config_end
                                    + " Possible configuration failure on "
                                    + config_host + "\n")

                    config_start = config_end = config_host = None
                    config_tst_f = puppet_error = None
                    daemon_fail = False
                elif (hb_loss_start and not hb_loss_end and hb_loss_host and
                        re.search(hb_loss_host + " Connectivity Recovered ",
                                  line)):
                    # Heartbeat loss window closes with recovery
                    hb_loss_end = line[0:19]
                    data.append(hb_loss_start + " to " + hb_loss_end
                                + " Heartbeat loss failure on " + hb_loss_host
                                + " due to too many missing heartbeat "
                                "messages\n")

                    hb_loss_start = hb_loss_end = hb_loss_host = None
                    comm_loss = False
                elif (hb_loss_start and comm_loss and not hb_loss_end and
                        hb_loss_host and re.search(
                            hb_loss_host + " Graceful Recovery Wait", line)):
                    # Heartbeat loss window closes without recovery
                    hb_loss_end = line[0:19]
                    data.append(hb_loss_start + " to " + hb_loss_end
                                + " Heartbeat loss failure on " + hb_loss_host
                                + " due to too many missing heartbeat "
                                "messages (recovery over disabled)\n")

                    hb_loss_start = hb_loss_end = hb_loss_host = None
                    comm_loss = False

        return data
||||
def search_hb_loss(self, start_time, end_time, host):
|
||||
"""Searches through the output file created by the heartbeat loss
|
||||
plugin for heartbeat loss message from host between one minute before
|
||||
start_time and end_time
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
hb_loss = False
|
||||
|
||||
# Open output file from heartbeat loss plugin and read it
|
||||
file_path = os.path.join(self.plugin_output_dir, "heartbeat_loss")
|
||||
|
||||
with open(file_path, "r") as heartbeat_loss:
|
||||
for line in heartbeat_loss:
|
||||
if (re.search("Error : " + host + " (.+) Heartbeat Loss ",
|
||||
line)):
|
||||
date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
||||
if (date >= start_time - timedelta(minutes=1)
|
||||
and date <= end_time):
|
||||
hb_loss = True
|
||||
break
|
||||
|
||||
return hb_loss
|
||||
|
||||
def search_daemon_fail(self, start_time, end_time, host):
|
||||
"""Searches through the output file created by the daemon failures
|
||||
plugin for puppet manifest failed message from host between 10 seconds
|
||||
before start_time and end_time
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
daemon_fail = False
|
||||
|
||||
# Open output file from daemon failures plugin and read it
|
||||
file_path = os.path.join(self.plugin_output_dir, "daemon_failures")
|
||||
|
||||
with open(file_path, "r") as daemon_failures:
|
||||
for line in daemon_failures:
|
||||
if (re.search("\\d " + host
|
||||
+ " (.+) Failed to run the puppet manifest",
|
||||
line)):
|
||||
date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
||||
if (date >= start_time - timedelta(seconds=10)
|
||||
and date <= end_time):
|
||||
daemon_fail = True
|
||||
break
|
||||
|
||||
return daemon_fail
|
||||
|
||||
def search_puppet_error(self, start_time, end_time):
|
||||
"""Searches through the output file created by the puppet errors
|
||||
plugin for error message between 10 seconds before start_time and
|
||||
end_time and returns it
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
puppet_log = None
|
||||
|
||||
# Open output file from puppet errors plugin and read it
|
||||
file_path = os.path.join(self.plugin_output_dir, "puppet_errors")
|
||||
|
||||
with open(file_path, "r") as puppet_errors:
|
||||
for line in puppet_errors:
|
||||
if "Error: " in line:
|
||||
date = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
||||
if (date >= start_time - timedelta(seconds=10)
|
||||
and date <= end_time):
|
||||
puppet_log = line
|
||||
break
|
||||
|
||||
return puppet_log
|
||||
|
||||
def get_events(self, hostname):
|
||||
"""Searches through the output files created by the plugins for
|
||||
significant events and summarizes them
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
data = []
|
||||
|
||||
# Open output file from maintenance errors plugin and read it
|
||||
file_path = os.path.join(self.plugin_output_dir, "maintenance_errors")
|
||||
|
||||
with open(file_path, "r") as mtc:
|
||||
for line in mtc:
|
||||
if "force failed by SM" in line:
|
||||
host = re.findall("Error : (.+) is being", line)[0]
|
||||
if hostname == "all" or host == hostname:
|
||||
data.append(line[0:19] + " " + host
|
||||
+ " force failed by SM\n")
|
||||
elif "Graceful Recovery Failed" in line:
|
||||
host = re.findall("Info : (.+) Task:", line)[0]
|
||||
if hostname == "all" or host == hostname:
|
||||
data.append(line[0:19] + " " + host
|
||||
+ " graceful recovery failed\n")
|
||||
|
||||
# Open output file from swact activity plugin and read it
|
||||
file_path = os.path.join(self.plugin_output_dir, "swact_activity")
|
||||
|
||||
with open(file_path, "r") as swact_activity:
|
||||
for line in swact_activity:
|
||||
if (re.search("Service (.+) is failed and has reached max "
|
||||
"failures", line)):
|
||||
host = re.findall(
|
||||
r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3} (.+) sm:",
|
||||
line)[0]
|
||||
svc_failed = re.findall(
|
||||
r"Service \((.+)\) is failed", line)[0]
|
||||
if hostname == "all" or host == hostname:
|
||||
data.append(line[0:19] + " " + host
|
||||
+ " service failure (" + svc_failed
|
||||
+ ")\n")
|
||||
|
||||
return data
|
||||
|
||||
def get_state_changes(self, hostname):
|
||||
"""Searches through the output files created by the state changes
|
||||
plugin and summarizes the changes of state of the hosts
|
||||
|
||||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
data = []
|
||||
|
||||
# Open output file from state changes plugin and read it
|
||||
file_path = os.path.join(self.plugin_output_dir, "state_changes")
|
||||
|
||||
with open(file_path, "r") as state_changes:
|
||||
for line in state_changes:
|
||||
if "is ENABLED" in line:
|
||||
host = re.findall("Info : (.+) is ENABLED", line)[0]
|
||||
state = re.findall("is (.+)\n", line)[0].lower()
|
||||
if hostname == "all" or hostname in host:
|
||||
data.append(line[0:19] + " " + host + " " + state
|
||||
+ "\n")
|
||||
elif "locked-disabled" in line:
|
||||
host = re.findall(
|
||||
"Info : (.+) u?n?locked-disabled", line)[0]
|
||||
if hostname == "all" or host == hostname:
|
||||
data.append(line[0:19] + " " + host + " disabled\n")
|
||||
|
||||
return data
|
|
@ -21,15 +21,17 @@ import os
|
|||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tarfile
|
||||
|
||||
import algorithms
|
||||
from correlator import Correlator
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ExecutionEngine:
|
||||
def __init__(self, opts):
|
||||
def __init__(self, opts, output_directory):
|
||||
"""Constructor for the ExecutionEngine class
|
||||
|
||||
Parameters:
|
||||
|
@ -39,6 +41,23 @@ class ExecutionEngine:
|
|||
self.hosts = {"controllers": {}, "workers": {}, "storages": {}}
|
||||
self.active_controller_directory = None
|
||||
|
||||
# Uncompresses host tar files if not already done
|
||||
with open(os.path.join(output_directory, "untar.log"), "a") as logfile:
|
||||
for obj in (os.scandir(self.opts.directory)):
|
||||
info = os.path.splitext(obj.name)
|
||||
if (obj.is_file() and obj.name != "report_tool.tgz" and
|
||||
tarfile.is_tarfile(obj.path) and not
|
||||
os.path.isdir(os.path.join(self.opts.directory,
|
||||
info[0]))):
|
||||
try:
|
||||
subprocess.run(["tar", "xzfC", obj.path,
|
||||
self.opts.directory],
|
||||
stderr=logfile, check=True)
|
||||
subprocess.run(["echo", "uncompressed", obj.name],
|
||||
check=True)
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(e)
|
||||
|
||||
for folder in (f.path for f in os.scandir(self.opts.directory)):
|
||||
database_path = os.path.join(folder, "var", "extra", "database")
|
||||
host_info_path = os.path.join(folder, "var", "extra", "host.info")
|
||||
|
@ -47,7 +66,8 @@ class ExecutionEngine:
|
|||
self.active_controller_directory = folder
|
||||
|
||||
if os.path.exists(host_info_path):
|
||||
hostname, subfunction = self._extract_subfunction(host_info_path)
|
||||
hostname, subfunction = self._extract_subfunction(
|
||||
host_info_path)
|
||||
if "controller" in subfunction:
|
||||
self.hosts["controllers"][hostname] = folder
|
||||
elif "worker" in subfunction:
|
||||
|
@ -67,13 +87,18 @@ class ExecutionEngine:
|
|||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
plugin_output_dir = os.path.join(output_directory, "plugins")
|
||||
os.makedirs(plugin_output_dir, exist_ok=True)
|
||||
|
||||
for plugin in plugins:
|
||||
logger.info(f"Processing plugin: {os.path.basename(plugin.file)}")
|
||||
processing = "Processing plugin: " + os.path.basename(plugin.file)
|
||||
hosts = {}
|
||||
if (
|
||||
plugin.state["hosts"] and len(plugin.state["hosts"]) >= 1
|
||||
): # if host list is given
|
||||
logger.info(
|
||||
f"Processing plugin: {os.path.basename(plugin.file)}")
|
||||
|
||||
for h in plugin.state["hosts"]:
|
||||
if h == "all":
|
||||
hosts.update(self.hosts["workers"])
|
||||
|
@ -86,44 +111,52 @@ class ExecutionEngine:
|
|||
|
||||
events = []
|
||||
if plugin.state["algorithm"] == algorithms.SUBSTRING:
|
||||
try:
|
||||
events = self.substring(
|
||||
plugin.state["substring"],
|
||||
[
|
||||
os.path.join(folderpath, file)
|
||||
for file in plugin.state["files"]
|
||||
],
|
||||
)
|
||||
except FileNotFoundError as e:
|
||||
logger.error(e)
|
||||
continue
|
||||
events = self.substring(
|
||||
plugin.state["substring"],
|
||||
[
|
||||
os.path.join(folderpath, file)
|
||||
for file in plugin.state["files"]
|
||||
],
|
||||
)
|
||||
|
||||
# creating output file
|
||||
output_file = os.path.join(
|
||||
output_directory,
|
||||
f"{hostname}_{os.path.basename(plugin.file)}_{plugin.state['algorithm']}",
|
||||
plugin_output_dir,
|
||||
f"substring_{hostname}",
|
||||
)
|
||||
logger.info("output at " + output_file)
|
||||
if self.opts.verbose:
|
||||
logger.info("output at "
|
||||
+ os.path.relpath(output_file))
|
||||
with open(output_file, "w") as file:
|
||||
file.write(
|
||||
f"Date range: {self.opts.start} until {self.opts.end}\n"
|
||||
f"Date range: {self.opts.start} until "
|
||||
f"{self.opts.end}\n"
|
||||
)
|
||||
file.write(
|
||||
f"substrings: {' '.join(plugin.state['substring'])}\n"
|
||||
f"substrings: "
|
||||
f"{' '.join(plugin.state['substring'])}\n"
|
||||
)
|
||||
for line in events:
|
||||
file.write(line + "\n")
|
||||
if line[-1] == "\n":
|
||||
file.write(line)
|
||||
else:
|
||||
file.write(line + "\n")
|
||||
else:
|
||||
if plugin.state["algorithm"] == algorithms.SYSTEM_INFO:
|
||||
info = self.system_info()
|
||||
system_info_output = os.path.join(output_directory, "system_info")
|
||||
system_info_output = os.path.join(plugin_output_dir,
|
||||
"system_info")
|
||||
with open(system_info_output, "w") as file:
|
||||
for i in info:
|
||||
file.write(i + "\n")
|
||||
|
||||
for k, v in self.hosts.items():
|
||||
file.write(f"{k}: {','.join(v.keys())}\n")
|
||||
logger.info("output at " + system_info_output)
|
||||
if self.opts.verbose:
|
||||
logger.info(processing + ", output at "
|
||||
+ os.path.relpath(system_info_output))
|
||||
else:
|
||||
logger.info(processing)
|
||||
|
||||
elif plugin.state["algorithm"] == algorithms.AUDIT:
|
||||
hosts = {}
|
||||
|
@ -134,38 +167,42 @@ class ExecutionEngine:
|
|||
for hostname, folderpath in hosts.items():
|
||||
self._create_output_file(
|
||||
f"{hostname}_audit",
|
||||
output_directory,
|
||||
plugin_output_dir,
|
||||
self.audit(
|
||||
plugin.state["start"],
|
||||
plugin.state["end"],
|
||||
os.path.join(
|
||||
folderpath, "var", "log", "dcmanager", "audit.log"
|
||||
folderpath, "var", "log", "dcmanager",
|
||||
"audit.log"
|
||||
),
|
||||
),
|
||||
processing,
|
||||
)
|
||||
|
||||
elif plugin.state["algorithm"] == algorithms.SWACT:
|
||||
elif plugin.state["algorithm"] == algorithms.SWACT_ACTIVITY:
|
||||
self._create_output_file(
|
||||
"swact_activity", output_directory, self.swact()
|
||||
"swact_activity", plugin_output_dir,
|
||||
self.swact_activity(), processing
|
||||
)
|
||||
|
||||
elif plugin.state["algorithm"] == algorithms.PUPPET:
|
||||
elif plugin.state["algorithm"] == algorithms.PUPPET_ERRORS:
|
||||
self._create_output_file(
|
||||
"puppet_errors", output_directory, self.puppet()
|
||||
"puppet_errors", plugin_output_dir,
|
||||
self.puppet_errors(), processing
|
||||
)
|
||||
|
||||
elif plugin.state["algorithm"] == algorithms.PROCESS_FAILURE:
|
||||
elif plugin.state["algorithm"] == algorithms.PROCESS_FAILURES:
|
||||
self._create_output_file(
|
||||
"process_failures", output_directory, self.process_failure()
|
||||
"process_failures", plugin_output_dir,
|
||||
self.process_failures(), processing
|
||||
)
|
||||
|
||||
elif plugin.state["algorithm"] == algorithms.ALARM:
|
||||
alarms, logs = self.alarm(
|
||||
plugin.state["alarm_ids"], plugin.state["entity_ids"]
|
||||
)
|
||||
alarm_output = os.path.join(output_directory, "alarm")
|
||||
log_output = os.path.join(output_directory, "log")
|
||||
os.makedirs(os.path.dirname(log_output), exist_ok=True)
|
||||
alarm_output = os.path.join(plugin_output_dir, "alarm")
|
||||
log_output = os.path.join(plugin_output_dir, "log")
|
||||
|
||||
# creating output alarm file
|
||||
with open(alarm_output, "w") as file:
|
||||
|
@ -186,8 +223,39 @@ class ExecutionEngine:
|
|||
file.write(f"{k}\n")
|
||||
for date in v["dates"]:
|
||||
file.write(f" {date}\n")
|
||||
logger.info("output at " + alarm_output)
|
||||
logger.info("output at " + log_output)
|
||||
if self.opts.verbose:
|
||||
logger.info(processing + ", output at "
|
||||
+ os.path.relpath(alarm_output)
|
||||
+ ", " + os.path.relpath(log_output))
|
||||
else:
|
||||
logger.info(processing)
|
||||
elif plugin.state["algorithm"] == algorithms.HEARTBEAT_LOSS:
|
||||
self._create_output_file(
|
||||
"heartbeat_loss", plugin_output_dir,
|
||||
self.heartbeat_loss(), processing
|
||||
)
|
||||
elif plugin.state["algorithm"] == algorithms.MAINTENANCE_ERR:
|
||||
self._create_output_file(
|
||||
"maintenance_errors", plugin_output_dir,
|
||||
self.maintenance_errors(), processing
|
||||
)
|
||||
elif plugin.state["algorithm"] == algorithms.DAEMON_FAILURES:
|
||||
self._create_output_file(
|
||||
"daemon_failures", plugin_output_dir,
|
||||
self.daemon_failures(), processing
|
||||
)
|
||||
elif plugin.state["algorithm"] == algorithms.STATE_CHANGES:
|
||||
self._create_output_file(
|
||||
"state_changes", plugin_output_dir,
|
||||
self.state_changes(), processing
|
||||
)
|
||||
|
||||
if not self.opts.verbose:
|
||||
logger.info("Output files for plugins can be found at " +
|
||||
os.path.relpath(plugin_output_dir))
|
||||
|
||||
# Running the correlator and printing the output from it
|
||||
self.run_correlator(output_directory, plugin_output_dir)
|
||||
|
||||
# Built-in algorithms ------------------------------
|
||||
def alarm(self, alarm_ids=[], entity_ids=[]):
|
||||
|
@ -234,8 +302,10 @@ class ExecutionEngine:
|
|||
entry_date = alarm_date.replace(
|
||||
" ", "T"
|
||||
) # making time format of alarm the same
|
||||
if self.opts.start <= entry_date and entry_date <= self.opts.end:
|
||||
# if the alarm is not in the user specified list of alarm or entity ids
|
||||
if (self.opts.start <= entry_date
|
||||
and entry_date <= self.opts.end):
|
||||
# if the alarm is not in the user specified list of
|
||||
# alarm or entity ids
|
||||
for id in alarm_ids:
|
||||
if id in alarm_id:
|
||||
break
|
||||
|
@ -262,15 +332,20 @@ class ExecutionEngine:
|
|||
f"{alarm_id} {entity_id} {severity}"
|
||||
]
|
||||
alarm_info["count"] += 1
|
||||
alarm_info["dates"].append(f"{alarm_date} {action}")
|
||||
alarm_info["dates"].append(
|
||||
f"{alarm_date} {action}")
|
||||
except KeyError:
|
||||
if entry[6] != "log":
|
||||
alarm_data[f"{alarm_id} {entity_id} {severity}"] = {
|
||||
alarm_data[
|
||||
f"{alarm_id} {entity_id} {severity}"
|
||||
] = {
|
||||
"count": 1,
|
||||
"dates": [f"{alarm_date} {action}"],
|
||||
}
|
||||
else:
|
||||
log_data[f"{alarm_id} {entity_id} {severity}"] = {
|
||||
log_data[
|
||||
f"{alarm_id} {entity_id} {severity}"
|
||||
] = {
|
||||
"count": 1,
|
||||
"dates": [alarm_date],
|
||||
}
|
||||
|
@ -294,37 +369,64 @@ class ExecutionEngine:
|
|||
Errors:
|
||||
FileNotFoundError
|
||||
"""
|
||||
CONTINUE_CURRENT = 0 # don't analyze older files, continue with current file
|
||||
CONTINUE_CURRENT_OLD = 1 # analyze older files, continue with current file
|
||||
# don't analyze older files, continue with current file
|
||||
CONTINUE_CURRENT = 0
|
||||
# analyze older files, continue with current file
|
||||
CONTINUE_CURRENT_OLD = 1
|
||||
|
||||
data = []
|
||||
for file in files:
|
||||
if not os.path.exists(file):
|
||||
raise FileNotFoundError(f"File not found: {file}")
|
||||
cont = True
|
||||
# Searching through file
|
||||
command = f"""grep -Ea "{'|'.join(s for s in substr)}" {file}"""
|
||||
status = self._continue(file)
|
||||
try:
|
||||
if not os.path.exists(file):
|
||||
if (re.search("controller-1_(.+)/var/log/mtcAgent.log",
|
||||
file)):
|
||||
continue
|
||||
raise FileNotFoundError(f"File not found: {file}")
|
||||
cont = True
|
||||
# Searching through file
|
||||
command = (f"""grep -Ea "{'|'.join(s for s in substr)}" """
|
||||
f"""{file} 2>/dev/null""")
|
||||
status = self._continue(file)
|
||||
|
||||
if (
|
||||
status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD
|
||||
): # continue with current file
|
||||
if status == CONTINUE_CURRENT:
|
||||
cont = False
|
||||
self._evaluate_substring(data, command)
|
||||
|
||||
# Searching through rotated log files
|
||||
n = 1
|
||||
while os.path.exists(f"{file}.{n}.gz") and cont:
|
||||
command = f"""zgrep -E "{'|'.join(s for s in substr)}" {file}.{n}.gz"""
|
||||
status = self._continue(f"{file}.{n}.gz", compressed=True)
|
||||
|
||||
if status == CONTINUE_CURRENT or status == CONTINUE_CURRENT_OLD:
|
||||
if (status == CONTINUE_CURRENT
|
||||
or status == CONTINUE_CURRENT_OLD):
|
||||
# continue with current file
|
||||
if status == CONTINUE_CURRENT:
|
||||
cont = False
|
||||
self._evaluate_substring(data, command)
|
||||
|
||||
n += 1
|
||||
# Searching through rotated log files that aren't compressed
|
||||
n = 1
|
||||
while os.path.exists(f"{file}.{n}") and cont:
|
||||
command = (f"""grep -Ea "{'|'.join(s for s in substr)}" """
|
||||
f"""{file}.{n} 2>/dev/null""")
|
||||
status = self._continue(f"{file}.{n}")
|
||||
|
||||
if (status == CONTINUE_CURRENT
|
||||
or status == CONTINUE_CURRENT_OLD):
|
||||
if status == CONTINUE_CURRENT:
|
||||
cont = False
|
||||
self._evaluate_substring(data, command)
|
||||
|
||||
n += 1
|
||||
|
||||
# Searching through rotated log files
|
||||
while os.path.exists(f"{file}.{n}.gz") and cont:
|
||||
command = (f"""zgrep -E "{'|'.join(s for s in substr)}" """
|
||||
f"""{file}.{n}.gz 2>/dev/null""")
|
||||
status = self._continue(f"{file}.{n}.gz", compressed=True)
|
||||
|
||||
if (status == CONTINUE_CURRENT
|
||||
or status == CONTINUE_CURRENT_OLD):
|
||||
if status == CONTINUE_CURRENT:
|
||||
cont = False
|
||||
self._evaluate_substring(data, command)
|
||||
|
||||
n += 1
|
||||
|
||||
except FileNotFoundError as e:
|
||||
logger.error(e)
|
||||
continue
|
||||
|
||||
return sorted(data)
|
||||
|
||||
|
@ -335,25 +437,29 @@ class ExecutionEngine:
|
|||
data = []
|
||||
with open(
|
||||
os.path.join(
|
||||
self.active_controller_directory, "etc", "platform", "platform.conf"
|
||||
self.active_controller_directory, "etc", "platform",
|
||||
"platform.conf"
|
||||
)
|
||||
) as file:
|
||||
for line in file:
|
||||
if "system_mode" in line:
|
||||
data.append(
|
||||
f"System Mode: {re.match('^system_mode=(.*)', line).group(1)}"
|
||||
f"System Mode: "
|
||||
f"{re.match('^system_mode=(.*)', line).group(1)}"
|
||||
)
|
||||
elif "system_type" in line:
|
||||
data.append(
|
||||
f"System Type: {re.match('^system_type=(.*)', line).group(1)}"
|
||||
f"System Type: "
|
||||
f"{re.match('^system_type=(.*)', line).group(1)}"
|
||||
)
|
||||
elif "distributed_cloud_role" in line:
|
||||
data.append(
|
||||
f"Distributed cloud role: {re.match('^distributed_cloud_role=(.*)', line).group(1)}"
|
||||
)
|
||||
role = re.match('^distributed_cloud_role=(.*)',
|
||||
line).group(1)
|
||||
data.append(f"Distributed cloud role: {role}")
|
||||
elif "sw_version" in line:
|
||||
data.append(
|
||||
f"SW Version: {re.match('^sw_version=(.*)', line).group(1)}"
|
||||
f"SW Version: "
|
||||
f"{re.match('^sw_version=(.*)', line).group(1)}"
|
||||
)
|
||||
with open(
|
||||
os.path.join(self.active_controller_directory, "etc", "build.info")
|
||||
|
@ -361,14 +467,15 @@ class ExecutionEngine:
|
|||
for line in file:
|
||||
if "BUILD_TYPE" in line:
|
||||
data.append(
|
||||
f"Build Type: {re.match('^BUILD_TYPE=(.*)', line).group(1)}"
|
||||
f"Build Type: "
|
||||
f"{re.match('^BUILD_TYPE=(.*)', line).group(1)}"
|
||||
)
|
||||
elif re.match("^OS=(.*)", line):
|
||||
data.append(f"OS: {re.match('^OS=(.*)', line).group(1)}")
|
||||
|
||||
return data
|
||||
|
||||
def swact(self):
|
||||
def swact_activity(self):
|
||||
"""Swact activity algorithm
|
||||
Presents all swacting activity in the system
|
||||
"""
|
||||
|
@ -382,42 +489,49 @@ class ExecutionEngine:
|
|||
for _, folder in self.hosts["controllers"].items():
|
||||
sm_path = os.path.join(folder, "var", "log", "sm.log")
|
||||
sm_files.append(sm_path)
|
||||
sm_customer_path = os.path.join(folder, "var", "log",
|
||||
"sm-customer.log")
|
||||
sm_customer_files.append(sm_customer_path)
|
||||
|
||||
sm_substrings = ["Swact has started,", "Swact update"]
|
||||
sm_substrings = ["Uncontrolled swact", "Swact has started,",
|
||||
"Neighbor (.+) is now in the down",
|
||||
"Service (.+) has reached max failures",
|
||||
"Swact update"]
|
||||
data = self.substring(sm_substrings, sm_files)
|
||||
|
||||
for i, line in enumerate(data):
|
||||
if "Swact has started," in line and not swact_in_progress:
|
||||
swact_in_progress = True
|
||||
swact_start = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
||||
swact_start = datetime.strptime(line[0:19],
|
||||
"%Y-%m-%dT%H:%M:%S")
|
||||
elif "Swact update" in line and swact_in_progress:
|
||||
swact_in_progress = False
|
||||
swact_end = datetime.strptime(line[0:19], "%Y-%m-%dT%H:%M:%S")
|
||||
line += f" SWACT TOOK {swact_end - swact_start} \n"
|
||||
data[i] = line
|
||||
|
||||
for _, folder in self.hosts["controllers"].items():
|
||||
sm_customer_path = os.path.join(folder, "var", "log", "sm-customer.log")
|
||||
sm_customer_files.append(sm_customer_path)
|
||||
|
||||
sm_customer_substrings = ["swact"]
|
||||
sm_customer_substrings = [
|
||||
"swact", "active-failed\\s+\\| disabling-failed\\s+\\|"
|
||||
]
|
||||
data += self.substring(sm_customer_substrings, sm_customer_files)
|
||||
|
||||
return sorted(data)
|
||||
|
||||
def puppet(self):
|
||||
"""Puppet error algorithm
|
||||
def puppet_errors(self):
|
||||
"""Puppet errors algorithm
|
||||
Presents log errors from puppet logs
|
||||
"""
|
||||
data = []
|
||||
for _, folder in self.hosts["controllers"].items():
|
||||
puppet_folder = os.path.join(folder, "var", "log", "puppet")
|
||||
command = f"grep -rh 'Error:' {puppet_folder}"
|
||||
self._evaluate_substring(data, command)
|
||||
for host_type in self.hosts.keys():
|
||||
for _, folder in self.hosts[host_type].items():
|
||||
puppet_folder = os.path.join(folder, "var", "log", "puppet")
|
||||
command = (f"""grep -rh "[m ]Error: " {puppet_folder} """
|
||||
f"""2>/dev/null""")
|
||||
self._evaluate_substring(data, command)
|
||||
return sorted(data)
|
||||
|
||||
def process_failure(self):
|
||||
"""Process failure algorithm
|
||||
def process_failures(self):
|
||||
"""Process failures algorithm
|
||||
Presents log errors from pmond
|
||||
"""
|
||||
data = []
|
||||
|
@ -426,9 +540,86 @@ class ExecutionEngine:
|
|||
for _, folder in self.hosts[host_type].items():
|
||||
pmond = os.path.join(folder, "var", "log", "pmond.log")
|
||||
files.append(pmond)
|
||||
|
||||
data = self.substring(["Error :"], files)
|
||||
|
||||
return data
|
||||
|
||||
def heartbeat_loss(self):
|
||||
"""Heartbeat loss algorithm
|
||||
Presents all heartbeat loss error messages in the system
|
||||
"""
|
||||
data = []
|
||||
hb_files = []
|
||||
|
||||
for _, folder in self.hosts["controllers"].items():
|
||||
hb_path = os.path.join(folder, "var", "log", "hbsAgent.log")
|
||||
hb_files.append(hb_path)
|
||||
|
||||
hb_substrings = ["Heartbeat Loss"]
|
||||
data = self.substring(hb_substrings, hb_files)
|
||||
|
||||
return sorted(data)
|
||||
|
||||
def maintenance_errors(self):
|
||||
"""Maintenance errors algorithm
|
||||
Presents maintenance errors and other relevant log messages in system
|
||||
"""
|
||||
data = []
|
||||
mtc_files = []
|
||||
|
||||
for _, folder in self.hosts["controllers"].items():
|
||||
agent = os.path.join(folder, "var", "log", "mtcAgent.log")
|
||||
mtc_files.append(agent)
|
||||
|
||||
for host_type in self.hosts.keys():
|
||||
for _, folder in self.hosts[host_type].items():
|
||||
client = os.path.join(folder, "var", "log", "mtcClient.log")
|
||||
mtc_files.append(client)
|
||||
|
||||
mtc_substrings = ["Error : ", "Configuration failure",
|
||||
"In-Test Failure", "Loss Of Communication",
|
||||
"Graceful Recovery Wait ",
|
||||
"regained MTCALIVE from host that has rebooted",
|
||||
"Connectivity Recovered ; ",
|
||||
"auto recovery disabled", "Graceful Recovery Failed"]
|
||||
data = self.substring(mtc_substrings, mtc_files)
|
||||
|
||||
return sorted(data)
|
||||
|
||||
def daemon_failures(self):
|
||||
"""Daemon failures algorithm
|
||||
Presents all failed puppet manifest messages in the system
|
||||
"""
|
||||
data = []
|
||||
daemon_files = []
|
||||
|
||||
for host_type in self.hosts.keys():
|
||||
for _, folder in self.hosts[host_type].items():
|
||||
daemon_path = os.path.join(folder, "var", "log", "daemon.log")
|
||||
daemon_files.append(daemon_path)
|
||||
|
||||
daemon_substrings = ["Failed to run the puppet manifest"]
|
||||
data = self.substring(daemon_substrings, daemon_files)
|
||||
|
||||
return sorted(data)
|
||||
|
||||
def state_changes(self):
|
||||
"""State changes algorithm
|
||||
Presents all messages in the system regarding the state of hosts
|
||||
"""
|
||||
data = []
|
||||
sc_files = []
|
||||
|
||||
for _, folder in self.hosts["controllers"].items():
|
||||
sc_path = os.path.join(folder, "var", "log", "mtcAgent.log")
|
||||
sc_files.append(sc_path)
|
||||
|
||||
sc_substrings = ["is ENABLED", "allStateChange (.+)locked-disabled"]
|
||||
data = self.substring(sc_substrings, sc_files)
|
||||
|
||||
return sorted(data)
|
||||
|
||||
def audit(self, start, end, audit_log_path):
|
||||
"""Counts audit events in dcmanager within a specified date range
|
||||
|
||||
|
@ -456,23 +647,29 @@ class ExecutionEngine:
|
|||
# Counts sum of audits from all subclouds
|
||||
]
|
||||
INDEX_MIDDLE_WORD = 1
|
||||
data = ["These rates and totals represent the sum of audits from all subclouds"]
|
||||
data = [("These rates and totals represent the sum of audits from "
|
||||
+ "all subclouds")]
|
||||
|
||||
def command(text):
|
||||
|
||||
return (
|
||||
f'lnav -R -n -c ";SELECT count(log_body) AS {text.split(" ")[INDEX_MIDDLE_WORD]}_total'
|
||||
f' from openstack_log WHERE (log_time > \\"{start}\\" AND not log_time > \\"{end}\\")'
|
||||
f'lnav -R -n -c ";SELECT count(log_body) AS '
|
||||
f'{text.split(" ")[INDEX_MIDDLE_WORD]}_total from '
|
||||
f'openstack_log WHERE '
|
||||
f'(log_time > \\"{start}\\" AND not log_time > \\"{end}\\")'
|
||||
f' AND log_body like \\"{text}\\"" "{audit_log_path}"'
|
||||
)
|
||||
|
||||
for text in log_texts:
|
||||
p = subprocess.Popen(command(text), shell=True, stdout=subprocess.PIPE)
|
||||
p = subprocess.Popen(command(text), shell=True,
|
||||
stdout=subprocess.PIPE)
|
||||
for line in p.stdout:
|
||||
line = line.decode("utf-8").strip()
|
||||
if line.isnumeric():
|
||||
data.append(
|
||||
f"rate {round((int(line)/seconds * SECONDS_PER_HOUR), 3)} per hour. total: {line}"
|
||||
f"rate "
|
||||
f"{round((int(line)/seconds * SECONDS_PER_HOUR), 3)} "
|
||||
f"per hour. total: {line}"
|
||||
)
|
||||
else:
|
||||
data.append(line)
|
||||
|
@ -480,12 +677,107 @@ class ExecutionEngine:
|
|||
|
||||
# -----------------------------------
|
||||
|
||||
def _continue(self, file, compressed=False):
|
||||
CONTINUE_CURRENT = 0 # don't analyze older files, continue with current file
|
||||
CONTINUE_CURRENT_OLD = 1 # analyze older files, continue with current file
|
||||
CONTINUE_OLD = 2 # don't analyze current file, continue to older files
|
||||
def run_correlator(self, output_directory, plugin_output_dir):
|
||||
"""Runs the correlator and prints the results differently based on if
|
||||
the tool was run with or without the verbose option
|
||||
|
||||
# check date of first log event and compare with provided start end dates
|
||||
Parameters:
|
||||
output_directory (string) : directory to place output files from
|
||||
correlator
|
||||
plugin_output_dir (string) : directory with output files from
|
||||
plugins
|
||||
"""
|
||||
correlator = Correlator(plugin_output_dir)
|
||||
failures, events, state_changes = correlator.run(self.opts.hostname)
|
||||
failures_len = len(failures)
|
||||
events_len, state_changes_len = len(events), len(state_changes)
|
||||
failures.append("\nTotal failures found: " + str(failures_len) + "\n")
|
||||
events.append("\nTotal events found: " + str(events_len) + "\n")
|
||||
state_changes.append("\nTotal state changes found: "
|
||||
+ str(state_changes_len) + "\n")
|
||||
|
||||
logger.info("\nRunning correlator...")
|
||||
self._create_output_file("correlator_failures", output_directory,
|
||||
failures, "")
|
||||
self._create_output_file("correlator_events", output_directory,
|
||||
events, "")
|
||||
self._create_output_file("correlator_state_changes", output_directory,
|
||||
state_changes, "")
|
||||
|
||||
if not self.opts.verbose:
|
||||
logger.info("Output can be found at "
|
||||
+ os.path.relpath(output_directory) + "\n")
|
||||
logger.info("Failures: " + str(failures_len))
|
||||
for f in failures[:-1]:
|
||||
if "Uncontrolled swact" in f:
|
||||
logger.info(f[0:19] + " "
|
||||
+ re.findall("active controller:? (.+)\n",
|
||||
f)[0] + " uncontrolled swact")
|
||||
elif "failure on" in f:
|
||||
host = re.findall(r"failure on ([^\s]+) ", f)[0]
|
||||
logger.info(f[0:19] + " " + host + " "
|
||||
+ re.findall("^(.+) failure on ",
|
||||
f[43:])[0].lower() + " failure")
|
||||
else:
|
||||
logger.info(f[:-1])
|
||||
if failures_len != 0:
|
||||
logger.info("\nEvents: " + str(events_len))
|
||||
else:
|
||||
logger.info("Events: " + str(events_len))
|
||||
logger.info("State Changes: " + str(state_changes_len))
|
||||
else:
|
||||
logger.info("\nFailures: " + str(failures_len))
|
||||
for f in failures[:-1]:
|
||||
logger.info(f[:-1])
|
||||
|
||||
# Dictionary to keep track of number of times events happens on
|
||||
# each host
|
||||
events_summ = {}
|
||||
for e in events[:-1]:
|
||||
k = e[20:-1]
|
||||
if "service failure" in k:
|
||||
k = k.split(" (", 1)[0]
|
||||
if not events_summ.get(k):
|
||||
events_summ[k] = 1
|
||||
else:
|
||||
events_summ[k] += 1
|
||||
|
||||
if failures_len != 0:
|
||||
logger.info("\nEvents: " + str(events_len))
|
||||
else:
|
||||
logger.info("Events: " + str(events_len))
|
||||
for k, v in sorted(events_summ.items()):
|
||||
logger.info(k + ": " + str(v) + " time(s)")
|
||||
|
||||
# Dictionary to keep track of number of times state changes
|
||||
# happens on each host
|
||||
state_changes_summ = {}
|
||||
for s in state_changes[:-1]:
|
||||
k = s[20:-1]
|
||||
if "enabled" in k:
|
||||
k = k.split("enabled", 1)[0] + "enabled"
|
||||
if not state_changes_summ.get(k):
|
||||
state_changes_summ[k] = 1
|
||||
else:
|
||||
state_changes_summ[k] += 1
|
||||
|
||||
if events_len != 0:
|
||||
logger.info("\nState Changes: " + str(state_changes_len))
|
||||
else:
|
||||
logger.info("State Changes: " + str(state_changes_len))
|
||||
for k, v in sorted(state_changes_summ.items()):
|
||||
logger.info(k + ": " + str(v) + " time(s)")
|
||||
|
||||
def _continue(self, file, compressed=False):
|
||||
# don't analyze older files, continue with current file
|
||||
CONTINUE_CURRENT = 0
|
||||
# analyze older files, continue with current file
|
||||
CONTINUE_CURRENT_OLD = 1
|
||||
# don't analyze current file, continue to older files
|
||||
CONTINUE_OLD = 2
|
||||
|
||||
# check date of first log event and compare with provided
|
||||
# start, end dates
|
||||
first = ""
|
||||
|
||||
if not compressed:
|
||||
|
@ -513,14 +805,15 @@ class ExecutionEngine:
|
|||
p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
|
||||
for line in p.stdout:
|
||||
line = line.decode("utf-8")
|
||||
dates = [line[0:19], line[2:21]] # different date locations for log events
|
||||
# different date locations for log events
|
||||
dates = [line[0:19], line[2:21]]
|
||||
for date in dates:
|
||||
try:
|
||||
datetime.strptime(date, "%Y-%m-%dT%H:%M:%S")
|
||||
if date > self.opts.start and date < self.opts.end:
|
||||
if line[0] == "|": # sm-customer.log edge case
|
||||
line = line.replace("|", "").strip()
|
||||
line = re.sub("\s+", " ", line)
|
||||
line = line[1:].strip()
|
||||
line = re.sub("\\s+", " ", line)
|
||||
data.append(line)
|
||||
break
|
||||
except ValueError:
|
||||
|
@ -531,16 +824,29 @@ class ExecutionEngine:
|
|||
GROUP_ONE = 1
|
||||
with open(host_info_path) as file:
|
||||
for line in file:
|
||||
hostname_match = re.match("^hostname => (.+)", line)
|
||||
subfunction_match = re.match("^subfunction => (.+)", line)
|
||||
hostname_match = re.match(
|
||||
r"\s*hostname =>\s*\"?([^\"]*)(\n|\"\s*,?\s*\n)", line)
|
||||
subfunction_match = re.match(
|
||||
r"\s*subfunction =>\s*\"?([^\"]*)(\n|\"\s*,?\s*\n)", line)
|
||||
if subfunction_match:
|
||||
subfunction = subfunction_match.group(GROUP_ONE)
|
||||
if hostname_match:
|
||||
hostname = hostname_match.group(GROUP_ONE)
|
||||
return hostname, subfunction
|
||||
|
||||
def _create_output_file(self, filename, directory, events):
|
||||
def _create_output_file(self, filename, directory, data, processing):
|
||||
with open(os.path.join(directory, filename), "w") as file:
|
||||
for i in events:
|
||||
file.write(i + "\n")
|
||||
logger.info("output at " + os.path.join(directory, filename))
|
||||
for i in data:
|
||||
if i[-1] == "\n":
|
||||
file.write(i)
|
||||
else:
|
||||
file.write(i + "\n")
|
||||
if self.opts.verbose:
|
||||
output = ("output at "
|
||||
+ os.path.relpath(os.path.join(directory, filename)))
|
||||
if processing == "":
|
||||
logger.info(output)
|
||||
else:
|
||||
logger.info(processing + ", " + output)
|
||||
elif processing != "":
|
||||
logger.info(processing)
|
||||
|
|
|
@ -121,69 +121,106 @@ class Plugin:
|
|||
if self.state["algorithm"] == algorithms.SUBSTRING:
|
||||
if len(self.state["files"]) == 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} needs files specified for substring algorithm"
|
||||
f"plugin: {plugin_name} needs files specified for "
|
||||
f"substring algorithm"
|
||||
)
|
||||
if len(self.state["hosts"]) == 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} needs hosts specified for substring algorithm"
|
||||
f"plugin: {plugin_name} needs hosts specified for "
|
||||
f"substring algorithm"
|
||||
)
|
||||
if len(self.state["substring"]) == 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} need substring specified for substring algorithm"
|
||||
f"plugin: {plugin_name} need substring specified for "
|
||||
f"substring algorithm"
|
||||
)
|
||||
elif self.state["algorithm"] == algorithms.ALARM:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
||||
f"plugin: {plugin_name} should not have hosts to be "
|
||||
f"specified"
|
||||
)
|
||||
elif self.state["algorithm"] == algorithms.SYSTEM_INFO:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
||||
f"plugin: {plugin_name} should not have hosts to be "
|
||||
f"specified"
|
||||
)
|
||||
elif self.state["algorithm"] == algorithms.SWACT:
|
||||
elif self.state["algorithm"] == algorithms.SWACT_ACTIVITY:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
||||
f"plugin: {plugin_name} should not have hosts to be "
|
||||
f"specified"
|
||||
)
|
||||
elif self.state["algorithm"] == algorithms.PUPPET:
|
||||
elif self.state["algorithm"] == algorithms.PUPPET_ERRORS:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
||||
f"plugin: {plugin_name} should not have hosts to be "
|
||||
f"specified"
|
||||
)
|
||||
elif self.state["algorithm"] == algorithms.PROCESS_FAILURE:
|
||||
elif self.state["algorithm"] == algorithms.PROCESS_FAILURES:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
||||
f"plugin: {plugin_name} should not have hosts to be "
|
||||
f"specified"
|
||||
)
|
||||
elif self.state["algorithm"] == algorithms.HEARTBEAT_LOSS:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} should not have hosts to be "
|
||||
f"specified"
|
||||
)
|
||||
elif self.state["algorithm"] == algorithms.MAINTENANCE_ERR:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} should not have hosts to be "
|
||||
f"specified"
|
||||
)
|
||||
elif self.state["algorithm"] == algorithms.DAEMON_FAILURES:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} should not have hosts to be "
|
||||
f"specified"
|
||||
)
|
||||
elif self.state["algorithm"] == algorithms.STATE_CHANGES:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} should not have hosts to be "
|
||||
f"specified"
|
||||
)
|
||||
elif self.state["algorithm"] == algorithms.AUDIT:
|
||||
if len(self.state["hosts"]) > 0:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} should not have hosts to be specified"
|
||||
f"plugin: {plugin_name} should not have hosts to be "
|
||||
f"specified"
|
||||
)
|
||||
|
||||
try:
|
||||
datetime.strptime(self.state["start"], "%Y-%m-%d %H:%M:%S")
|
||||
except:
|
||||
raise ValueError(
|
||||
f"plugin : {plugin_name} needs a start time in YYYY-MM-DD HH:MM:SS format"
|
||||
f"plugin : {plugin_name} needs a start time in YYYY-MM-DD "
|
||||
f"HH:MM:SS format"
|
||||
)
|
||||
|
||||
try:
|
||||
datetime.strptime(self.state["end"], "%Y-%m-%d %H:%M:%S")
|
||||
except:
|
||||
raise ValueError(
|
||||
f"plugin : {plugin_name} needs an end time in YYYY-MM-DD HH:MM:SS format"
|
||||
f"plugin : {plugin_name} needs an end time in YYYY-MM-DD "
|
||||
f"HH:MM:SS format"
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"plugin: {plugin_name} unknown algorithm {self.state['algorithm']}"
|
||||
f"plugin: {plugin_name} unknown algorithm "
|
||||
f"{self.state['algorithm']}"
|
||||
)
|
||||
|
||||
for host in self.state["hosts"]:
|
||||
if host not in ["controllers", "workers", "storages", "all"]:
|
||||
raise ValueError(
|
||||
f"host not recognized: '{host}', accepted hosts are 'controllers', 'workers', 'storages', 'all'"
|
||||
f"host not recognized: '{host}', accepted hosts are "
|
||||
f"'controllers', 'workers', 'storages', 'all'"
|
||||
)
|
||||
|
||||
def __str__(self) -> str:
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
algorithm=alarm
|
||||
alarm_ids=400., 401.
|
||||
entity_ids=host=controller-0, host=controller-1
|
|
@ -0,0 +1 @@
|
|||
algorithm=daemon_failures
|
|
@ -0,0 +1 @@
|
|||
algorithm=heartbeat_loss
|
|
@ -0,0 +1 @@
|
|||
algorithm=maintenance_errors
|
|
@ -0,0 +1 @@
|
|||
algorithm=process_failures
|
|
@ -0,0 +1 @@
|
|||
algorithm=puppet_errors
|
|
@ -0,0 +1 @@
|
|||
algorithm=state_changes
|
|
@ -0,0 +1,5 @@
|
|||
algorithm=substring
|
||||
files=var/log/mtcAgent.log, var/log/sm.log
|
||||
hosts=controllers
|
||||
substring=operation failed
|
||||
substring=Failed to send message
|
|
@ -0,0 +1 @@
|
|||
algorithm=swact_activity
|
|
@ -0,0 +1 @@
|
|||
algorithm=system_info
|
|
@ -17,9 +17,9 @@
|
|||
# The report tool requires the collect bundle and host tarballs to be
|
||||
# untarred.
|
||||
#
|
||||
# The report tool reads user plugins from a plugins directory in the
|
||||
# top level of the collect bundle, and outputs files containing
|
||||
# relevant logs to a report directory in the top level as well.
|
||||
# The report tool reads user plugins from the report directory in the
|
||||
# top level of the collect bundle, and outputs files containing files
|
||||
# containing relevant logs to this directory as well.
|
||||
#
|
||||
# Typical Usage:
|
||||
# command line functionality
|
||||
|
@ -39,9 +39,12 @@
|
|||
import argparse
|
||||
from cmath import log
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from datetime import timezone
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
from execution_engine import ExecutionEngine
|
||||
|
@ -49,39 +52,58 @@ from plugin import Plugin
|
|||
|
||||
|
||||
now = datetime.now(timezone.utc)
|
||||
base_dir = os.path.realpath(__file__)
|
||||
default_path = os.path.join(os.path.dirname(base_dir), "..", "..")
|
||||
base_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
parent_dir = os.path.dirname(base_dir)
|
||||
default_path = os.path.dirname(parent_dir)
|
||||
plugins = []
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Log Event Reporter",
|
||||
epilog="Place plugins in 'plugins' directory at top level of collect bundle. Output files will be placed in 'report' directory."
|
||||
"\nThis tool will create a report.log file along with other output files",
|
||||
epilog="Place plugins in 'plugins' directory found in 'report' directory "
|
||||
"at top level of collect bundle.\nOutput files will be placed in 'report' "
|
||||
"directory.\nThis tool will create a report.log and untar.log file along "
|
||||
"with other output files.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
help="Verbose output",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-s",
|
||||
"--start",
|
||||
default="20000101",
|
||||
help="Specify a start date in YYYYMMDD format for analysis (default:20000101)",
|
||||
help="Specify a start date in YYYYMMDD format for analysis "
|
||||
"(default:20000101)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-e",
|
||||
"--end",
|
||||
default=datetime.strftime(now, "%Y%m%d"),
|
||||
help="Specify an end date in YYYYMMDD format for analysis (default: current date)",
|
||||
default=datetime.strftime(now + timedelta(days=1), "%Y%m%d"),
|
||||
help="Specify an end date in YYYYMMDD format for analysis "
|
||||
"(default: current date)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-p",
|
||||
"--plugin",
|
||||
default=None,
|
||||
nargs="*",
|
||||
help="Specify what plugins to run (default: runs every plugin in plugins folder)",
|
||||
help="Specify what plugins to run (default: runs every plugin in plugins "
|
||||
"folder)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-d",
|
||||
"--directory",
|
||||
default=default_path,
|
||||
help="Specify top level of collect bundle to analyze (default: two levels above current location)",
|
||||
help="Specify top level of collect bundle to analyze "
|
||||
"(default: two levels above tool directory)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hostname",
|
||||
default="all",
|
||||
help="Specify host for correlator to find significant events and state "
|
||||
"changes for (default: all hosts)",
|
||||
)
|
||||
subparsers = parser.add_subparsers(help="algorithms", dest="algorithm")
|
||||
|
||||
|
@ -89,14 +111,15 @@ subparsers = parser.add_subparsers(help="algorithms", dest="algorithm")
|
|||
parser_substring = subparsers.add_parser(
|
||||
"substring",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="""Searches through specified files for lines containing specified substring.
|
||||
There will be an output file for each host of the host type specified.""",
|
||||
help="""Searches through specified files for lines containing specified
|
||||
substring. There will be an output file for each host of the host
|
||||
type specified.""",
|
||||
epilog="Plugin file example:\n"
|
||||
" algorithm=substring\n"
|
||||
" files=mtcAgent.log, sm.log\n"
|
||||
" hosts=controllers, workers\n"
|
||||
" substring=Swact in progress\n"
|
||||
" substring=Swact update",
|
||||
" files=var/log/mtcAgent.log, var/log/sm.log\n"
|
||||
" hosts=controllers\n"
|
||||
" substring=operation failed\n"
|
||||
" substring=Failed to send message",
|
||||
)
|
||||
substring_required = parser_substring.add_argument_group("required arguments")
|
||||
substring_required.add_argument(
|
||||
|
@ -106,7 +129,8 @@ substring_required.add_argument(
|
|||
help="Files to perform substring analysis on (required)",
|
||||
)
|
||||
substring_required.add_argument(
|
||||
"--substring", nargs="+", required=True, help="Substrings to search for (required)"
|
||||
"--substring", nargs="+", required=True,
|
||||
help="Substrings to search for (required)"
|
||||
)
|
||||
substring_required.add_argument(
|
||||
"--hosts",
|
||||
|
@ -121,11 +145,12 @@ substring_required.add_argument(
|
|||
parser_alarm = subparsers.add_parser(
|
||||
"alarm",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Searches through fm.db.sql.txt for alarms and logs. There are 2 output files: 'alarm', and 'log'",
|
||||
help="Searches through fm.db.sql.txt for alarms and logs. There are 2 "
|
||||
"output files: 'alarm', and 'log'",
|
||||
epilog="Plugin file example:\n"
|
||||
" algorithm=alarm\n"
|
||||
" alarm_ids=400.005,200.004\n"
|
||||
" entity_ids= host=controller-0,host=controller-1\n",
|
||||
" alarm_ids=400.005, 200.004\n"
|
||||
" entity_ids=host=controller-0, host=controller-1\n",
|
||||
)
|
||||
parser_alarm.add_argument(
|
||||
"--alarm_ids",
|
||||
|
@ -151,27 +176,61 @@ parser_system_info = subparsers.add_parser(
|
|||
)
|
||||
|
||||
# swact activity algorithm
|
||||
parser_swact = subparsers.add_parser(
|
||||
"swact",
|
||||
parser_swact_activity = subparsers.add_parser(
|
||||
"swact_activity",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents system swacting activity",
|
||||
epilog="Plugin file example:\n" " algorithm=swact\n",
|
||||
epilog="Plugin file example:\n" " algorithm=swact_activity\n",
|
||||
)
|
||||
|
||||
# puppet errors algorithm
|
||||
parser_puppet = subparsers.add_parser(
|
||||
"puppet",
|
||||
parser_puppet_errors = subparsers.add_parser(
|
||||
"puppet_errors",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents any puppet errors",
|
||||
epilog="Plugin file example:\n" " algorithm=puppet\n",
|
||||
epilog="Plugin file example:\n" " algorithm=puppet_errors\n",
|
||||
)
|
||||
|
||||
# process failure algorithm
|
||||
parser_process_failure = subparsers.add_parser(
|
||||
"process_failure",
|
||||
# process failures algorithm
|
||||
parser_process_failures = subparsers.add_parser(
|
||||
"process_failures",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents any process failures from pmond.log",
|
||||
epilog="Plugin file example:\n" " algorithm=process_failure\n",
|
||||
epilog="Plugin file example:\n" " algorithm=process_failures\n",
|
||||
)
|
||||
|
||||
# daemon failures algorithm
|
||||
parser_daemon_failures = subparsers.add_parser(
|
||||
"daemon_failures",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents any puppet manifest failures from daemon.log",
|
||||
epilog="Plugin file example:\n" " algorithm=daemon_failures\n",
|
||||
)
|
||||
|
||||
# heartbeat loss algorithm
|
||||
parser_heartbeat_loss = subparsers.add_parser(
|
||||
"heartbeat_loss",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents any heartbeat loss error messages from hbsAgent.log",
|
||||
epilog="Plugin file example:\n" " algorithm=heartbeat_loss\n",
|
||||
)
|
||||
|
||||
# maintenance errors algorithm
|
||||
parser_maintenance_errors = subparsers.add_parser(
|
||||
"maintenance_errors",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents errors and other relevant messages from mtcAgent.log and "
|
||||
"mtcClient.log",
|
||||
epilog="Plugin file example:\n" " algorithm=maintenance_errors\n",
|
||||
)
|
||||
|
||||
# state changes algorithm
|
||||
parser_state_changes = subparsers.add_parser(
|
||||
"state_changes",
|
||||
formatter_class=argparse.RawTextHelpFormatter,
|
||||
help="Presents any messages from mtcAgent.log regarding the state of "
|
||||
"hosts, such as enabled/disabled",
|
||||
epilog="Plugin file example:\n" " algorithm=state_changes\n",
|
||||
)
|
||||
|
||||
# audit algorithm
|
||||
|
@ -185,24 +244,41 @@ parser_audit = subparsers.add_parser(
|
|||
" start=2022-06-01 10:00:00\n"
|
||||
" end=2022-06-01 04:00:00\n",
|
||||
)
|
||||
parser_audit_required = parser_audit.add_argument_group("required arguments")
|
||||
parser_audit_required.add_argument("--start", required=True)
|
||||
parser_audit_required.add_argument(
|
||||
parser_audit.add_argument(
|
||||
"--start",
|
||||
required=False,
|
||||
default=datetime.strftime(now - timedelta(days=7), "%Y-%m-%d %H:%M:%S"),
|
||||
type=str,
|
||||
help="Specify a start date in YYYY-MM-DD HH:MM:SS format for analysis "
|
||||
"(not required, default: 1 week ago)"
|
||||
)
|
||||
parser_audit.add_argument(
|
||||
"--end",
|
||||
required=True,
|
||||
required=False,
|
||||
default=datetime.strftime(now, "%Y-%m-%d %H:%M:%S"),
|
||||
type=str,
|
||||
help="Specify an end date in YYYY-MM-DD HH:MM:SS format for analysis "
|
||||
"(not required, default: today)"
|
||||
)
|
||||
|
||||
|
||||
args = parser.parse_args()
|
||||
args.start = datetime.strptime(args.start, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S")
|
||||
args.start = datetime.strptime(args.start, "%Y%m%d").strftime(
|
||||
"%Y-%m-%dT%H:%M:%S")
|
||||
args.end = datetime.strptime(args.end, "%Y%m%d").strftime("%Y-%m-%dT%H:%M:%S")
|
||||
|
||||
output_directory = os.path.join(
|
||||
args.directory, "report", "output", now.strftime("%Y%m%d.%H%M%S")
|
||||
)
|
||||
if args.directory.endswith("/"):
|
||||
output_directory = os.path.join(
|
||||
default_path, "report", "output",
|
||||
os.path.basename(os.path.dirname(args.directory))
|
||||
)
|
||||
else:
|
||||
output_directory = os.path.join(
|
||||
default_path, "report", "output", os.path.basename(args.directory)
|
||||
)
|
||||
|
||||
# creating report log
|
||||
os.makedirs(output_directory)
|
||||
os.makedirs(output_directory, exist_ok=True)
|
||||
open(os.path.join(output_directory, "report.log"), "w").close()
|
||||
|
||||
# setting up logger
|
||||
|
@ -223,17 +299,44 @@ ch.setFormatter(formatter)
|
|||
|
||||
logger.addHandler(ch)
|
||||
|
||||
if not os.path.isdir(args.directory):
|
||||
sys.exit("Top level of collect bundle given to analyze is not a directory")
|
||||
else:
|
||||
for obj in (os.scandir(args.directory)):
|
||||
info = os.path.splitext(obj.name)
|
||||
|
||||
# TODO: ask user which file to report on if more than one tarball in
|
||||
# directory
|
||||
# Check if collect tarball is in given directory and extracts it if
|
||||
# not already done
|
||||
if (obj.is_file() and info[1] == ".tar"):
|
||||
try:
|
||||
result = subprocess.check_output(["tar", "tf", obj.path],
|
||||
encoding="UTF-8")
|
||||
result = result.split("\n", 1)
|
||||
if not os.path.isdir(os.path.join(args.directory,
|
||||
os.path.dirname(result[0]))):
|
||||
subprocess.run(["tar", "xfC", obj.path, args.directory],
|
||||
check=True)
|
||||
subprocess.run(["echo", "extracted", obj.name], check=True)
|
||||
args.directory = os.path.join(args.directory,
|
||||
os.path.dirname(result[0]))
|
||||
break
|
||||
except subprocess.CalledProcessError as e:
|
||||
logger.error(e)
|
||||
|
||||
try:
|
||||
engine = ExecutionEngine(args)
|
||||
engine = ExecutionEngine(args, output_directory)
|
||||
except ValueError as e:
|
||||
logger.error(str(e))
|
||||
sys.exit("Confirm you are running the report tool on a collect bundle")
|
||||
|
||||
if args.algorithm:
|
||||
plugins.append(Plugin(opts=vars(args)))
|
||||
else:
|
||||
if args.plugin:
|
||||
for p in args.plugin:
|
||||
path = os.path.join(args.directory, "plugins", p)
|
||||
path = os.path.join(default_path, "report", "plugins", p)
|
||||
if os.path.exists(path):
|
||||
try:
|
||||
plugins.append(Plugin(path))
|
||||
|
@ -243,7 +346,7 @@ else:
|
|||
else:
|
||||
logger.warning(f"{p} plugin does not exist")
|
||||
else:
|
||||
path = os.path.join(args.directory, "plugins")
|
||||
path = os.path.join(default_path, "report", "plugins")
|
||||
if not os.path.exists(path):
|
||||
os.mkdir(path)
|
||||
logger.error("Plugins folder is empty")
|
||||
|
|
Loading…
Reference in New Issue