From 85f605a7620392bfc8bca7ef8eff7864bddaffc3 Mon Sep 17 00:00:00 2001
From: Eric MacDonald
Date: Thu, 15 Oct 2020 15:51:04 -0400
Subject: [PATCH] Add /var/crash dump management to maintenance.

The Linux kernel can be configured to perform a crash dump and reboot
in response to specific, typically serious, events. A crash dump event
produces a crash dump report bundle (directory) of files that represent
the state of the kernel at the time of the event; useful for post-event
root cause analysis.

The kernel directs new crash dump bundles to /var/crash/. Crash dump
bundles are quite large and, if too many occur, can fill up their
target filesystem.

This update adds crash dump bundle management to maintenance with a
new crashDumpMgr service script and installs a crash dump logrotate
configuration file to compress/preserve the first crash bundle and
compress/rotate all subsequent bundles.

With repeated crash dumps and the help of background log rotation this
update produces the following compressed crash dump bundles

controller-1:~$ ls -lrth /var/log/crash
total 238M
-rw-r--r-- 1 root 77M vmcore_first.tar.1.gz
-rw-r--r-- 1 root 75M vmcore.tar.1.gz

Change-Id: I2741e610c6c417d7fc14dfada283a1edacd9327f
Partial-Fix: 1898602
Signed-off-by: Eric MacDonald
---
 mtce/centos/mtce.spec                 |   7 ++
 mtce/src/scripts/crashDumpMgr         | 131 ++++++++++++++++++++++++++
 mtce/src/scripts/crashDumpMgr.service |  13 +++
 mtce/src/scripts/crashdump.logrotate  |  14 +++
 4 files changed, 165 insertions(+)
 create mode 100644 mtce/src/scripts/crashDumpMgr
 create mode 100644 mtce/src/scripts/crashDumpMgr.service
 create mode 100644 mtce/src/scripts/crashdump.logrotate

diff --git a/mtce/centos/mtce.spec b/mtce/centos/mtce.spec
index d7b0c683..96211df0 100644
--- a/mtce/centos/mtce.spec
+++ b/mtce/centos/mtce.spec
@@ -349,6 +349,8 @@ install -m 700 -p -D %{_buildsubdir}/alarm/scripts/mtcalarm.init %{buildroot}%{_
 # TODO: Init hack. Should move to proper module
 install -m 755 -p -D %{_buildsubdir}/scripts/hwclock.sh %{buildroot}%{_sysconfdir}/init.d/hwclock.sh
 install -m 644 -p -D %{_buildsubdir}/scripts/hwclock.service %{buildroot}%{_unitdir}/hwclock.service
+install -m 755 -p -D %{_buildsubdir}/scripts/crashDumpMgr %{buildroot}%{_sysconfdir}/init.d/crashDumpMgr
+install -m 644 -p -D %{_buildsubdir}/scripts/crashDumpMgr.service %{buildroot}%{_unitdir}/crashDumpMgr.service
 
 # systemd service files
 install -m 644 -p -D %{_buildsubdir}/fsmon/scripts/fsmon.service %{buildroot}%{_unitdir}/fsmon.service
@@ -395,6 +397,7 @@ install -m 644 -p -D %{_buildsubdir}/lmon/scripts/lmon.pmon.conf %{buildroot}%{l
 
 # log rotation
 install -m 755 -d %{buildroot}%{_sysconfdir}/logrotate.d
+install -m 644 -p -D %{_buildsubdir}/scripts/crashdump.logrotate %{buildroot}%{local_etc_logrotated}/crashdump.logrotate
 install -m 644 -p -D %{_buildsubdir}/scripts/mtce.logrotate %{buildroot}%{local_etc_logrotated}/mtce.logrotate
 install -m 644 -p -D %{_buildsubdir}/hostw/scripts/hostw.logrotate %{buildroot}%{local_etc_logrotated}/hostw.logrotate
 install -m 644 -p -D %{_buildsubdir}/pmon/scripts/pmon.logrotate %{buildroot}%{local_etc_logrotated}/pmon.logrotate
@@ -424,6 +427,7 @@ install -m 755 -d %{buildroot}/var/run
 /bin/systemctl enable rsyncd.service
 /bin/systemctl enable goenabled.service
 /bin/systemctl enable mtcalarm.service
+/bin/systemctl enable crashDumpMgr.service
 
 %post -n mtce-hostw
 /bin/systemctl enable hostw.service
@@ -470,6 +474,7 @@ install -m 755 -d %{buildroot}/var/run
 %{local_etc_logrotated}/fsmon.logrotate
 %{local_etc_logrotated}/mtce.logrotate
 %{local_etc_logrotated}/mtcalarm.logrotate
+%{local_etc_logrotated}/crashdump.logrotate
 
 # Maintenance start/stop services scripts
 %{local_etc_servicesd}/controller/mtcTest
@@ -490,6 +495,7 @@ install -m 755 -d %{buildroot}/var/run
 %{_sysconfdir}/init.d/mtcClient
 %{_sysconfdir}/init.d/mtcalarm
 %{_sysconfdir}/init.d/hwclock.sh
+%{_sysconfdir}/init.d/crashDumpMgr
 
 %{_unitdir}/runservices.service
 %{_unitdir}/goenabled.service
@@ -499,6 +505,7 @@ install -m 755 -d %{buildroot}/var/run
 %{_unitdir}/mtcClient.service
 %{_unitdir}/hbsClient.service
 %{_unitdir}/hwclock.service
+%{_unitdir}/crashDumpMgr.service
 
 # Binaries
 %{local_bindir}/mtcAgent
diff --git a/mtce/src/scripts/crashDumpMgr b/mtce/src/scripts/crashDumpMgr
new file mode 100644
index 00000000..d57be7ba
--- /dev/null
+++ b/mtce/src/scripts/crashDumpMgr
@@ -0,0 +1,131 @@
+#!/bin/bash
+#
+# Copyright (c) 2020 Wind River Systems, Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+
+# chkconfig: 2345 98 2
+#
+### BEGIN INIT INFO
+# Provides: crashDumpMgr
+# Required-Start: $null
+# Required-Stop: $null
+# Default-Start: 3 5
+# Default-Stop: 0 1 2 6
+# Short-Description: Maintenance 'Crash Dump' Manager script
+### END INIT INFO
+
+CRASHDUMPMGR_TAG=${CRASHDUMPMGR_TAG:-"crashDumpMgr"}
+
+RETVAL=0
+
+#############################################################################
+# Log message ($@) to syslog via logger, tagged with ${CRASHDUMPMGR_TAG}
+#############################################################################
+
+function log()
+{
+    logger -t "${CRASHDUMPMGR_TAG}" "$@"
+}
+
+#############################################################################
+#
+# Name : manage_crash_dumps
+#
+# Purpose: Prevent crash dumps from filling up the root fs
+#
+# The kernel directs new crash dump bundles to
+# /var/crash/. Crash dump
+# bundles are quite large and, if too many occur,
+# can fill up their target filesystem.
+#
+# This function nicely tars a crash bundle found in /var/crash
+# to /var/log/crash, then removes every bundle dir that holds a vmcore.
+#
+# The first bundle is tar'ed as vmcore_first.tar and preserved.
+# Subsequent crash bundles are nicely tar'ed as vmcore.tar
+#
+# Save the crash dump vmcore summary for all crash dumps.
+#
+# Assumptions: logrotate is used to compress these bundles in the background
+#
+############################################################################
+
+function manage_crash_dumps()
+{
+    CRASH_DIR="/var/crash"
+    CRASH_BUNDLE_DIR="/var/log/crash"
+    OTHER_BUNDLE="${CRASH_BUNDLE_DIR}/vmcore.tar"
+    FIRST_BUNDLE="${CRASH_BUNDLE_DIR}/vmcore_first.tar"
+    FIRST_BUNDLE_ROTATED="${CRASH_BUNDLE_DIR}/vmcore_first.tar.1.gz"
+    CRASH_BUNDLE_SUMMARY="vmcore-dmesg.txt"
+
+    # tar command and nice levels ; expanded unquoted below so they split into argv
+    TAR_CMD="tar -cf"
+    NICE_CMD="/usr/bin/nice -n19"
+    IONICE_CMD="/usr/bin/ionice -c2 -n7"
+
+    log "managing ${CRASH_DIR}"
+    cleanup=false
+
+    # create dir (and any missing parent) if it does not exist
+    if [ ! -d "${CRASH_BUNDLE_DIR}" ] ; then
+        mkdir -p "${CRASH_BUNDLE_DIR}"
+    fi
+
+    for entry in "${CRASH_DIR}"/*
+    do
+        if [ -d "${entry}" ] ; then
+            if [ -e "${entry}/vmcore" ] ; then
+
+                # save the crash dump vmcore summary for all crash dumps ; it is kept in ${CRASH_DIR}
+                cp -a "${entry}/${CRASH_BUNDLE_SUMMARY}" "${CRASH_DIR}/$(basename "${entry}")_${CRASH_BUNDLE_SUMMARY}"
+
+                if [ "${cleanup}" != true ] ; then
+                    if [ -e "${FIRST_BUNDLE}" ] || [ -e "${FIRST_BUNDLE_ROTATED}" ] ; then
+                        if [ ! -e "${OTHER_BUNDLE}" ] ; then
+                            log "creating bundle from ${entry}"
+                            ${IONICE_CMD} ${NICE_CMD} ${TAR_CMD} "${OTHER_BUNDLE}" -C "${CRASH_DIR}" "$(basename "${entry}")"
+                            cleanup=true
+                        fi
+                    else
+                        log "creating first bundle from ${entry}"
+                        ${IONICE_CMD} ${NICE_CMD} ${TAR_CMD} "${FIRST_BUNDLE}" -C "${CRASH_DIR}" "$(basename "${entry}")"
+                        cleanup=true
+                    fi
+                fi
+                log "removing ${entry}"
+                rm -rf "${entry}"
+            fi
+        fi
+    done
+}
+
+# service case
+case "$1" in
+    start)
+        manage_crash_dumps
+        ;;
+
+    stop)
+        log "stop"
+        ;;
+
+    restart)
+        log "restart"
+        # 'stop' only logs, so a restart simply re-runs the crash dump manager
+        manage_crash_dumps
+        ;;
+
+    status)
+        log "status"
+        ;;
+
+    *)
+        log "usage: $0 { start | stop | status | restart }"
+        RETVAL=1
+        ;;
+esac
+
+exit $RETVAL
diff --git a/mtce/src/scripts/crashDumpMgr.service b/mtce/src/scripts/crashDumpMgr.service
new file mode 100644
index 00000000..49f979e8
--- /dev/null
+++ b/mtce/src/scripts/crashDumpMgr.service
@@ -0,0 +1,13 @@
+[Unit]
+Description=Crash Dump Manager
+After=network.target
+Before=sshd.service
+
+[Service]
+Type=oneshot
+RemainAfterExit=no
+ExecStart=/etc/init.d/crashDumpMgr start
+ExecStop=/etc/init.d/crashDumpMgr stop
+
+[Install]
+WantedBy=multi-user.target
diff --git a/mtce/src/scripts/crashdump.logrotate b/mtce/src/scripts/crashdump.logrotate
new file mode 100644
index 00000000..a16bcb7c
--- /dev/null
+++ b/mtce/src/scripts/crashdump.logrotate
@@ -0,0 +1,14 @@
+/var/log/crash/vmcore.tar
+/var/log/crash/vmcore_first.tar
+{
+    nodateext
+    size 1K
+    start 1
+    rotate 1
+    missingok
+    notifempty
+    compress
+    postrotate
+        rm -f $1
+    endscript
+}