Add /var/crash dump management to maintenance.

The Linux kernel can be configured to perform a
crash dump and reboot in response to specific,
typically serious, events.

A crash dump event produces a crash dump report
bundle (directory) of files that represent the
state of the kernel at the time of the event;
these files are useful for post-event root cause analysis.

The kernel directs new crash dump bundles to
/var/crash/<dated vmcore bundle>. Crash dump
bundles are quite large and, if too many occur,
can fill up their target filesystem.

This update adds crash dump bundle management
to maintenance with a new crashDumpMgr
service script and installs a crash dump
logrotate configuration file to
compress/preserve the first crash bundle and
compress/rotate all subsequent bundles.

With repeated crash dumps and the help of
background log rotation this update produces
the following compressed crash dump bundles:

controller-1:~$ ls -lrth /var/log/crash
total 238M
-rw-r--r-- 1 root 77M <date> vmcore_first.tar.1.gz
-rw-r--r-- 1 root 75M <date> vmcore.tar.1.gz

Change-Id: I2741e610c6c417d7fc14dfada283a1edacd9327f
Partial-Fix: 1898602
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2020-10-15 15:51:04 -04:00
parent f5725ad694
commit 85f605a762
4 changed files with 165 additions and 0 deletions

View File

@ -349,6 +349,8 @@ install -m 700 -p -D %{_buildsubdir}/alarm/scripts/mtcalarm.init %{buildroot}%{_
# TODO: Init hack. Should move to proper module
install -m 755 -p -D %{_buildsubdir}/scripts/hwclock.sh %{buildroot}%{_sysconfdir}/init.d/hwclock.sh
install -m 644 -p -D %{_buildsubdir}/scripts/hwclock.service %{buildroot}%{_unitdir}/hwclock.service
install -m 755 -p -D %{_buildsubdir}/scripts/crashDumpMgr %{buildroot}%{_sysconfdir}/init.d/crashDumpMgr
install -m 644 -p -D %{_buildsubdir}/scripts/crashDumpMgr.service %{buildroot}%{_unitdir}/crashDumpMgr.service
# systemd service files
install -m 644 -p -D %{_buildsubdir}/fsmon/scripts/fsmon.service %{buildroot}%{_unitdir}/fsmon.service
@ -395,6 +397,7 @@ install -m 644 -p -D %{_buildsubdir}/lmon/scripts/lmon.pmon.conf %{buildroot}%{l
# log rotation
install -m 755 -d %{buildroot}%{_sysconfdir}/logrotate.d
install -m 644 -p -D %{_buildsubdir}/scripts/crashdump.logrotate %{buildroot}%{local_etc_logrotated}/crashdump.logrotate
install -m 644 -p -D %{_buildsubdir}/scripts/mtce.logrotate %{buildroot}%{local_etc_logrotated}/mtce.logrotate
install -m 644 -p -D %{_buildsubdir}/hostw/scripts/hostw.logrotate %{buildroot}%{local_etc_logrotated}/hostw.logrotate
install -m 644 -p -D %{_buildsubdir}/pmon/scripts/pmon.logrotate %{buildroot}%{local_etc_logrotated}/pmon.logrotate
@ -424,6 +427,7 @@ install -m 755 -d %{buildroot}/var/run
/bin/systemctl enable rsyncd.service
/bin/systemctl enable goenabled.service
/bin/systemctl enable mtcalarm.service
/bin/systemctl enable crashDumpMgr.service
%post -n mtce-hostw
/bin/systemctl enable hostw.service
@ -470,6 +474,7 @@ install -m 755 -d %{buildroot}/var/run
%{local_etc_logrotated}/fsmon.logrotate
%{local_etc_logrotated}/mtce.logrotate
%{local_etc_logrotated}/mtcalarm.logrotate
%{local_etc_logrotated}/crashdump.logrotate
# Maintenance start/stop services scripts
%{local_etc_servicesd}/controller/mtcTest
@ -490,6 +495,7 @@ install -m 755 -d %{buildroot}/var/run
%{_sysconfdir}/init.d/mtcClient
%{_sysconfdir}/init.d/mtcalarm
%{_sysconfdir}/init.d/hwclock.sh
%{_sysconfdir}/init.d/crashDumpMgr
%{_unitdir}/runservices.service
%{_unitdir}/goenabled.service
@ -499,6 +505,7 @@ install -m 755 -d %{buildroot}/var/run
%{_unitdir}/mtcClient.service
%{_unitdir}/hbsClient.service
%{_unitdir}/hwclock.service
%{_unitdir}/crashDumpMgr.service
# Binaries
%{local_bindir}/mtcAgent

View File

@ -0,0 +1,131 @@
#!/bin/bash
#
# Copyright (c) 2020 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# chkconfig: 2345 98 2
#
### BEGIN INIT INFO
# Provides: crashDumpMgr
# Required-Start: $null
# Required-Stop: $null
# Default-Start: 3 5
# Default-Stop: 0 1 2 6
# Short-Description: Maintenance 'Crash Dump' Manager script
### END INIT INFO
# Syslog tag used by log(); keeps any non-empty value already present in the
# environment and otherwise falls back to the script name.
: "${CRASHDUMPMGR_TAG:=crashDumpMgr}"
# Exit status handed to 'exit' at the end of the script.
RETVAL=0
#############################################################################
# Log message to syslog
#
# Arguments: $@ - message text; each argument is handed to logger unchanged
# Globals  : CRASHDUMPMGR_TAG (read) - syslog tag passed to 'logger -t'
#############################################################################
function log()
{
    # Quote the tag and the message so that whitespace and glob characters
    # (for example '*') in a message are logged literally instead of being
    # word-split or expanded to file names by the shell.
    logger -t "${CRASHDUMPMGR_TAG}" "$@"
}
#############################################################################
#
# Name   : manage_crash_dumps
#
# Purpose: Prevent crash dumps from filling up the root fs
#
#          The kernel directs new crash dump bundles to
#          /var/crash/<dated vmcore bundle>. Crash dump
#          bundles are quite large and, if too many occur,
#          can fill up their target filesystem.
#
#          This function nicely tars a crash bundle found in /var/crash
#          to /var/log/crash.
#
#          The first bundle is tar'ed as vmcore_first.tar and preserved.
#          Subsequent crash bundles are nicely tar'ed as vmcore.tar
#
#          At most one bundle is created per call. Every other crash dump
#          directory (one that contains a 'vmcore' file) is removed without
#          being bundled, including when vmcore.tar already exists.
#
#          Save the crash dump vmcore summary for all crash dumps.
#
# Arguments: $1 - crash dump directory to manage (default: /var/crash)
#            $2 - directory the tar bundles are written to
#                 (default: /var/log/crash)
#
# Returns  : 0 normally, 1 if the bundle directory cannot be created
#
# Assumptions: logrotate is used to compress these bundles in the background
#
############################################################################
function manage_crash_dumps()
{
    local crash_dir="${1:-/var/crash}"
    local bundle_dir="${2:-/var/log/crash}"
    local other_bundle="${bundle_dir}/vmcore.tar"
    local first_bundle="${bundle_dir}/vmcore_first.tar"
    local first_bundle_rotated="${bundle_dir}/vmcore_first.tar.1.gz"
    local summary="vmcore-dmesg.txt"
    local bundled=false
    local entry name target
    local -a tar_cmd=()

    # tar command and nice levels.
    # Each priority wrapper is probed first and only used when it works, so
    # a missing or refused ionice/nice lowers nothing but never prevents the
    # bundle from being created.
    if /usr/bin/ionice -c2 -n7 true >/dev/null 2>&1 ; then
        tar_cmd+=(/usr/bin/ionice -c2 -n7)
    fi
    if /usr/bin/nice -n19 true >/dev/null 2>&1 ; then
        tar_cmd+=(/usr/bin/nice -n19)
    fi
    tar_cmd+=(tar -cf)

    log "managing ${crash_dir}"

    # create dir if it does not exist
    if [[ ! -d "${bundle_dir}" ]] ; then
        if ! mkdir -p -- "${bundle_dir}" ; then
            log "failed to create ${bundle_dir}"
            return 1
        fi
    fi

    for entry in "${crash_dir}"/* ; do
        # Only directories holding a vmcore are crash dump bundles. This also
        # skips the literal pattern left behind when the glob matches nothing.
        [[ -d "${entry}" && -e "${entry}/vmcore" ]] || continue
        name="${entry##*/}"

        # save the crash dump vmcore summary for all crash dumps
        if [[ -e "${entry}/${summary}" ]] ; then
            cp -a -- "${entry}/${summary}" "${crash_dir}/${name}_${summary}"
        fi

        # Decide which bundle, if any, this dump becomes.
        target=""
        if [[ "${bundled}" != true ]] ; then
            if [[ -e "${first_bundle}" || -e "${first_bundle_rotated}" ]] ; then
                if [[ ! -e "${other_bundle}" ]] ; then
                    log "creating bundle from ${entry}"
                    target="${other_bundle}"
                fi
            else
                log "creating first bundle from ${entry}"
                target="${first_bundle}"
            fi
        fi

        if [[ -n "${target}" ]] ; then
            if "${tar_cmd[@]}" "${target}" -C "${crash_dir}" "${name}" ; then
                bundled=true
            else
                # Do not delete the only copy of a dump that failed to
                # archive, and do not leave a truncated bundle behind that
                # would be mistaken for a valid one on the next run.
                log "failed to create ${target} from ${entry} ; keeping ${entry}"
                rm -f -- "${target}"
                continue
            fi
        fi

        log "removing ${entry}"
        rm -rf -- "${entry}"
    done

    return 0
}
# service case
#
#   start   - bundle/prune the crash dumps via manage_crash_dumps
#   stop    - nothing to undo ; only logged
#   restart - logged stop followed by another start pass
#   status  - only logged
#
# A non-zero status from manage_crash_dumps is carried into RETVAL so that
# the caller can see that the pass failed.
case "${1:-}" in
    start)
        manage_crash_dumps || RETVAL=$?
        ;;
    stop)
        log "stop"
        ;;
    restart)
        log "restart"
        # 'stop' and 'start' are case labels, not functions or commands, so
        # perform their actions directly instead of trying to invoke them.
        log "stop"
        manage_crash_dumps || RETVAL=$?
        ;;
    status)
        log "status"
        ;;
    *)
        log "usage: $0 { start | stop | status | restart }"
        RETVAL=1
        ;;
esac
exit "${RETVAL:-0}"

View File

@ -0,0 +1,13 @@
# systemd unit that runs the crashDumpMgr init script.
[Unit]
Description=Crash Dump Manager
# Ordering only: started after network.target and before sshd.service.
After=network.target
Before=sshd.service
[Service]
# The script runs to completion and exits; there is no long-running daemon.
Type=oneshot
# The unit is not kept 'active' once ExecStart has finished.
RemainAfterExit=no
ExecStart=/etc/init.d/crashDumpMgr start
# The script's 'stop' action only writes a log entry.
ExecStop=/etc/init.d/crashDumpMgr stop
[Install]
# Enabling the unit hooks it into multi-user.target.
WantedBy=multi-user.target

View File

@ -0,0 +1,14 @@
# Rotation rules for the crash dump bundles written by crashDumpMgr.
# With the numbering options below the rotated files are named
# vmcore.tar.1.gz and vmcore_first.tar.1.gz.
/var/log/crash/vmcore.tar
/var/log/crash/vmcore_first.tar
{
# number the rotated file instead of adding a date suffix
nodateext
# rotate as soon as a bundle exceeds this size
size 1K
# numbering starts at 1 and only a single rotated copy is kept
start 1
rotate 1
# a bundle that does not exist is not an error ; empty files are left alone
missingok
notifempty
# gzip the rotated bundle
compress
# Remove whatever is left at the original bundle path after rotation.
# NOTE(review): presumably this clears an empty file recreated by a global
# 'create' setting so that crashDumpMgr sees the bundle name as free again
# - confirm against the system-wide logrotate configuration.
postrotate
rm -f $1
endscript
}