Remove sm-watchdog service since NFS is now stable

sm-watchdog was introduced as a workaround for NFS hangs. A clean fix
has since been provided, but sm-watchdog was never removed.

Test plan:
[centos] build, install and unlock.
[debian] build, install and unlock.

Story: 2010087
Task: 46007

Signed-off-by: Davi Frossard <dbarrosf@windriver.com>
Change-Id: I29fffff4e8982dc504f104f49c6586f7c74527fb
This commit is contained in:
Davi Frossard 2022-08-11 15:59:17 -04:00
parent 924c088f3a
commit bd9e560d4b
30 changed files with 4 additions and 1610 deletions

View File

@ -44,7 +44,6 @@
sm-tools: true
sm-api: true
sm-eru: true
sm-watchdog: true
mysql: false
postgresql: true
tls-proxy: false

View File

@ -156,14 +156,9 @@ function cleanup_sm_common {
$STX_INST_DIR/lib64/libsm_common.so.* \
$STX_BIN_DIR/sm-eru \
$STX_BIN_DIR/sm-eru-dump \
$STX_BIN_DIR/sm-watchdog \
$STX_SM_VAR_DIR/watchdog/modules/libsm_watchdog_nfs.so.* \
$STX_SYSCONFDIR/systemd/system/sm-eru.service \
$STX_SYSCONFDIR/systemd/system/sm-watchdog.service \
$STX_SYSCONFDIR/pmon.d/sm-eru.conf \
$STX_SYSCONFDIR/pmon.d/sm-watchdog.conf \
$STX_SYSCONFDIR/init.d/sm-eru \
$STX_SYSCONFDIR/init.d/sm-watchdog \
/etc/ld.so.conf.d/stx-ha.conf
popd
@ -190,7 +185,6 @@ function configure_ha {
if is_service_enabled sm-common; then
config_eru
config_watchdog
fi
if is_service_enabled sm-daemon; then
@ -215,12 +209,6 @@ function config_eru {
iniset -sudo ${STX_SYSCONFDIR}/systemd/system/devstack@sm-eru.service "Service" "PIDFile" "/var/run/sm-eru.pid"
}
# Re-point the sm-watchdog init script at the binary under $STX_INST_DIR/bin
# and set the service Type and PIDFile in the devstack systemd unit.
function config_watchdog {
    local wd_init_script=$STX_SYSCONFDIR/init.d/sm-watchdog
    local wd_unit_file=${STX_SYSCONFDIR}/systemd/system/devstack@sm-watchdog.service
    local wd_sed_expr="s%SM_WATCHDOG=\"/usr/bin/\${SM_WATCHDOG_NAME}\"%SM_WATCHDOG=\"$STX_INST_DIR/bin/\${SM_WATCHDOG_NAME}\"%"

    sudo sed -i "$wd_sed_expr" $wd_init_script

    iniset -sudo $wd_unit_file "Service" "Type" "forking"
    iniset -sudo $wd_unit_file "Service" "PIDFile" "/var/run/sm-watchdog.pid"
}
function create_sm_accounts {
create_service_user "smapi"
get_or_create_service "smapi" "servicemanagement" "Service Management"
@ -340,15 +328,11 @@ function install_sm_common {
install_sm_common_libs
sudo install -m 0755 -p -D -t $STX_SM_VAR_DIR/watchdog/modules src/libsm_watchdog_nfs.so.${STX_SM_COMMON_VERSION}
sudo cp -P src/libsm_watchdog_nfs.so src/libsm_watchdog_nfs.so.${STX_SM_COMMON_VERSION%%.*} $STX_SM_VAR_DIR/watchdog/modules
# scripts/
(cd scripts; sudo make DEST_DIR= UNIT_DIR=$STX_SYSCONFDIR/systemd/system install)
sudo install -m 750 -p -D src/sm_eru $STX_BIN_DIR/sm-eru
sudo install -m 750 -p -D src/sm_eru_dump $STX_BIN_DIR/sm-eru-dump
sudo install -m 750 -p -D src/sm_watchdog $STX_BIN_DIR/sm-watchdog
echo $STX_INST_DIR/lib64 | sudo tee /etc/ld.so.conf.d/stx-ha.conf
sudo ldconfig
@ -411,10 +395,6 @@ function start_eru {
run_process sm-eru "${STX_SYSCONFDIR}/init.d/sm-eru start" root root
}
# Start sm-watchdog via run_process, using the init script's "start" action
# and root as both of the trailing run_process arguments.
function start_watchdog {
    local wd_start_cmd="${STX_SYSCONFDIR}/init.d/sm-watchdog start"

    run_process sm-watchdog "$wd_start_cmd" root root
}
function start_ha {
if is_service_enabled sm-daemon; then
start_sm
@ -426,14 +406,12 @@ function start_ha {
if is_service_enabled sm-common; then
start_eru
start_watchdog
fi
}
function stop_ha {
if is_service_enabled sm-common; then
stop_process sm-eru
stop_process sm-watchdog
fi
if is_service_enabled sm-api; then

View File

@ -16,14 +16,10 @@ install:
install -m 750 -d $(DEST_DIR)/usr/bin
install -m 750 -p -D $(BUILDSUBDIR)/src/sm_eru $(DEST_DIR)/$(BIN_DIR)/sm-eru
install -m 750 -p -D $(BUILDSUBDIR)/src/sm_eru_dump $(DEST_DIR)/$(BIN_DIR)/sm-eru-dump
install -m 750 -p -D $(BUILDSUBDIR)/src/sm_watchdog $(DEST_DIR)/$(BIN_DIR)/sm-watchdog
install -m 644 -p -D $(BUILDSUBDIR)/scripts/sm-eru.service $(DEST_DIR)/$(UNIT_DIR)/sm-eru.service
install -m 644 -p -D $(BUILDSUBDIR)/scripts/sm-watchdog.service $(DEST_DIR)/$(UNIT_DIR)/sm-watchdog.service
install -m 750 -d $(DEST_DIR)/$(ETC_DIR)/pmon.d
install -m 640 -p -D $(BUILDSUBDIR)/scripts/sm-eru.conf $(DEST_DIR)/$(ETC_DIR)/pmon.d/sm-eru.conf
install -m 640 -p -D $(BUILDSUBDIR)/scripts/sm-watchdog.conf $(DEST_DIR)/$(ETC_DIR)/pmon.d/sm-watchdog.conf
install -m 750 -p -D $(BUILDSUBDIR)/scripts/sm-eru $(DEST_DIR)/$(ETC_DIR)/init.d/sm-eru
install -m 750 -p -D $(BUILDSUBDIR)/scripts/sm-watchdog $(DEST_DIR)/$(ETC_DIR)/init.d/sm-watchdog
clean:
@( cd src; make clean )

View File

@ -91,9 +91,6 @@ MAJOR=`echo $VER | awk -F . '{print $1}'`
MINOR=`echo $VER | awk -F . '{print $2}'`
make DEST_DIR=%{buildroot} BIN_DIR=%{_bindir} UNIT_DIR=%{_unitdir} LIB_DIR=%{_libdir} INC_DIR=%{_includedir} BUILDSUBDIR=%{_buildsubdir} VER=$VER VER_MJR=$MAJOR install
%post
/usr/bin/systemctl enable sm-watchdog.service >/dev/null 2>&1
%post -n sm-eru
/usr/bin/systemctl enable sm-eru.service >/dev/null 2>&1
@ -101,10 +98,6 @@ make DEST_DIR=%{buildroot} BIN_DIR=%{_bindir} UNIT_DIR=%{_unitdir} LIB_DIR=%{_li
%files
%license LICENSE
%defattr(-,root,root,-)
/etc/init.d/sm-watchdog
/etc/pmon.d/sm-watchdog.conf
/usr/bin/sm-watchdog
/usr/lib/systemd/system/sm-watchdog.service
#%{_unitdir}/*
#%{_bindir}/*
@ -113,10 +106,6 @@ make DEST_DIR=%{buildroot} BIN_DIR=%{_bindir} UNIT_DIR=%{_unitdir} LIB_DIR=%{_li
%files libs
%{_libdir}/*.so.*
%dir "/var/lib/sm"
%dir "/var/lib/sm/watchdog"
%dir "/var/lib/sm/watchdog/modules"
/var/lib/sm/watchdog/modules/*.so.*
%files -n sm-eru
%defattr(-,root,root,-)
@ -135,18 +124,14 @@ make DEST_DIR=%{buildroot} BIN_DIR=%{_bindir} UNIT_DIR=%{_unitdir} LIB_DIR=%{_li
#"/usr/lib64/.debug/libsm_common.so.1.0.0"
#%dir "/usr/bin/.debug"
#"/usr/bin/.debug/sm-eru-dump"
#"/usr/bin/.debug/sm-watchdog"
#"/usr/bin/.debug/sm-eru"
#%dir "/usr/src/debug/sm-common"
#%dir "/usr/src/debug/sm-common/1.0.0-r7"
#%dir "/usr/src/debug/sm-common/1.0.0-r7/src"
#/usr/src/debug/sm-common/1.0.0-r7/src/*.h
#/usr/src/debug/sm-common/1.0.0-r7/src/*.c
#%dir "/var/lib/sm/watchdog/modules/.debug"
#"/var/lib/sm/watchdog/modules/.debug/libsm_watchdog_nfs.so.1.0.0"
%files dev
%defattr(-,root,root,-)
%{_includedir}/*
%{_libdir}/*.so
/var/lib/sm/watchdog/modules/libsm_watchdog_nfs.so

View File

@ -23,11 +23,8 @@ override_dh_auto_install:
# Prevents dh_fixperms from changing the permissions defined in the makefiles
override_dh_fixperms:
dh_fixperms \
-Xsm-watchdog* \
-Xlibsm_common.so.* \
-Xlibsm_watchdog_nfs.so.* \
-Xsm-eru*
override_dh_installsystemd:
dh_installsystemd -psm-common sm-watchdog.service
dh_installsystemd -psm-eru sm-eru.service

View File

@ -1,3 +1,2 @@
usr/include/*
usr/lib/*.so
var/lib/sm/watchdog/modules/libsm_watchdog_nfs.so

View File

@ -1,3 +1 @@
/var/lib/sm
/var/lib/sm/watchdog
/var/lib/sm/watchdog/modules

View File

@ -1,2 +1 @@
usr/lib/*.so.*
var/lib/sm/watchdog/modules/*.so.*

View File

@ -1,5 +1 @@
etc/init.d/sm-watchdog
etc/pmon.d/sm-watchdog.conf
usr/bin/sm-watchdog
lib/systemd/system/sm-watchdog.service
debian/systemd/00-sm-common.preset etc/systemd/system-preset

View File

@ -1 +0,0 @@
enable sm-watchdog.service

View File

@ -72,19 +72,6 @@ MAJOR=`echo $VER | awk -F . '{print $1}'`
MINOR=`echo $VER | awk -F . '{print $2}'`
make DEST_DIR=%{buildroot} BIN_DIR=%{_bindir} UNIT_DIR=%{_unitdir} LIB_DIR=%{_libdir} INC_DIR=%{_includedir} BUILDSUBDIR=%{_buildsubdir} VER=$VER VER_MJR=$MAJOR install
%pre
%service_add_pre sm-watchdog.service sm-watchdog.target
%preun
%service_del_preun sm-watchdog.service sm-watchdog.target
%post
%service_add_post sm-watchdog.service sm-watchdog.target
/usr/bin/systemctl enable sm-watchdog.service
%postun
%service_del_postun sm-watchdog.service sm-watchdog.target
%pre -n sm-eru
%service_add_pre sm-eru.service sm-eru.target
@ -108,17 +95,10 @@ make DEST_DIR=%{buildroot} BIN_DIR=%{_bindir} UNIT_DIR=%{_unitdir} LIB_DIR=%{_li
%files
%license LICENSE
%defattr(-,root,root,-)
%{_sysconfdir}/init.d/sm-watchdog
%config %{_sysconfdir}/pmon.d/sm-watchdog.conf
%{_bindir}/sm-watchdog
%{_unitdir}/sm-watchdog.service
%files libs
%{_libdir}/*.so.*
%dir %{_sharedstatedir}/sm
%dir %{_sharedstatedir}/sm/watchdog
%dir %{_sharedstatedir}/sm/watchdog/modules
%{_sharedstatedir}/sm/watchdog/modules/*.so.*
%files -n sm-eru
%defattr(-,root,root,-)
@ -134,6 +114,5 @@ make DEST_DIR=%{buildroot} BIN_DIR=%{_bindir} UNIT_DIR=%{_unitdir} LIB_DIR=%{_li
%defattr(-,root,root,-)
%{_includedir}/*
%{_libdir}/*.so
%{_sharedstatedir}/sm/watchdog/modules/libsm_watchdog_nfs.so
%changelog

View File

@ -6,7 +6,7 @@ install:
install -d $(DEST_DIR)$(UNIT_DIR)
install -m 644 *.service $(DEST_DIR)$(UNIT_DIR)
install -d $(DEST_DIR)/etc/init.d
install sm-watchdog sm-eru $(DEST_DIR)/etc/init.d
install sm-eru $(DEST_DIR)/etc/init.d
install -d $(DEST_DIR)/etc/pmon.d
install *.conf $(DEST_DIR)/etc/pmon.d

View File

@ -1,131 +0,0 @@
#! /bin/sh
#
# Copyright (c) 2014 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# chkconfig: - 87 87
# processname: sm-watchdog
# description: Service Management Watchdog
#
### BEGIN INIT INFO
# Description: sm-watchdog
#
# Short-Description: Service Management Watchdog
# Provides: sm-watchdog
# Required-Start: $network
# Should-Start: $syslog
# Required-Stop: $network
# Default-Start: 3 5
# Default-Stop: 0 6
### END INIT INFO
# Distro init helpers; killproc (used by "stop") is presumably provided here.
. /etc/init.d/functions

RETVAL=0

SM_WATCHDOG_NAME="sm-watchdog"
SM_WATCHDOG="/usr/bin/${SM_WATCHDOG_NAME}"
SM_WATCHDOG_PIDFILE="/var/run/${SM_WATCHDOG_NAME}.pid"

# Exit with code 5 when the daemon binary is absent.
if [ ! -e "${SM_WATCHDOG}" ]
then
    logger "${SM_WATCHDOG} is missing"
    exit 5
fi

PATH=/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/bin

# NOTE: this script runs under /bin/sh, so output is silenced with the
# portable "> /dev/null 2>&1" form.  The bash-only "&>" form is parsed by a
# POSIX shell as "run in background" followed by an empty redirection, which
# makes $? always 0 and breaks the stop/status checks below.
case "$1" in
    start)
        echo -n "Starting ${SM_WATCHDOG_NAME}: "
        if [ -n "`pidof ${SM_WATCHDOG}`" ]
        then
            # PMOND might have restarted SM-WATCHDOG already.
            RETVAL=0
        else
            start-stop-daemon --start -b -x ${SM_WATCHDOG}
            RETVAL=$?
        fi
        if [ ${RETVAL} -eq 0 ]
        then
            echo "OK"
        else
            echo "FAIL"
            RETVAL=1
        fi
        ;;

    stop)
        echo -n "Stopping ${SM_WATCHDOG_NAME}: "
        if [ -n "`pidof ${SM_WATCHDOG}`" ]
        then
            killproc ${SM_WATCHDOG}
        fi

        # Give the daemon up to SHUTDOWN_TIMEOUT seconds to disappear.
        SHUTDOWN_TIMEOUT=5
        count=0
        while [ ${count} -lt ${SHUTDOWN_TIMEOUT} ]
        do
            pidof ${SM_WATCHDOG} > /dev/null 2>&1
            rc=$?
            if [ ${rc} -eq 1 ]
            then
                echo "OK"
                break
            fi
            count=`expr ${count} + 1`
            sleep 1
        done

        # Still alive after the timeout: report failure with code 7.
        pidof ${SM_WATCHDOG} > /dev/null 2>&1
        rc=$?
        if [ ${rc} -eq 0 ]
        then
            echo "FAIL"
            RETVAL=7
        fi

        rm -f ${SM_WATCHDOG_PIDFILE}
        ;;

    status)
        pid=`cat ${SM_WATCHDOG_PIDFILE} 2>/dev/null`
        if [ -n "${pid}" ]
        then
            if ps -p ${pid} > /dev/null 2>&1
            then
                echo "${SM_WATCHDOG_NAME} is running"
                RETVAL=0
            else
                echo "${SM_WATCHDOG_NAME} is not running but has pid file"
                RETVAL=1
            fi
        else
            echo "${SM_WATCHDOG_NAME} is not running"
            RETVAL=3
        fi
        ;;

    restart)
        $0 stop
        sleep 1
        $0 start
        ;;

    reload)
        echo "${SM_WATCHDOG_NAME} reload"
        $0 restart
        ;;

    force-reload)
        echo "${SM_WATCHDOG_NAME} force-reload"
        $0 restart
        ;;

    *)
        echo "usage: $0 { start | stop | status | restart | reload | force-reload }"
        ;;
esac

exit ${RETVAL}

View File

@ -1,15 +0,0 @@
;
; Copyright (c) 2014 Wind River Systems, Inc.
;
; SPDX-License-Identifier: Apache-2.0
;
[process]
process = sm-watchdog
pidfile = /var/run/sm-watchdog.pid
script = /etc/init.d/sm-watchdog
style = lsb ; lsb
severity = major ; minor, major, critical
restarts = 3 ; restarts before error assertion
startuptime = 5 ; seconds to wait after process start
interval = 5 ; number of seconds to wait between restarts
debounce = 20 ; number of seconds to wait before degrade clear

View File

@ -1,15 +0,0 @@
# systemd unit for the Service Management Watchdog.  Start and stop are
# delegated to the /etc/init.d/sm-watchdog script.
[Unit]
Description=Service Management Watchdog
# Ordering only: started after networking, syslog-ng and config, and before
# sm and pmon.
After=network-online.target syslog-ng.service config.service
Before=sm.service pmon.service
[Service]
Type=forking
RemainAfterExit=yes
User=root
ExecStart=/etc/init.d/sm-watchdog start
ExecStop=/etc/init.d/sm-watchdog stop
# Same path as the pidfile entry in the pmon sm-watchdog.conf.
PIDFile=/var/run/sm-watchdog.pid
[Install]
WantedBy=multi-user.target

View File

@ -34,7 +34,7 @@ EXTRACCFLAGS+= -Wformat -Wformat-security
LDLIBS= -lsqlite3 -lglib-2.0 -lgmodule-2.0 -luuid -lrt -lpthread
LDFLAGS = -shared -rdynamic
build: libsm_common.so libsm_watchdog_nfs.so sm_watchdog sm_eru sm_eru_dump
build: libsm_common.so sm_eru sm_eru_dump
.c.o:
$(CXX) $(INCLUDES) $(CCFLAGS) $(EXTRACCFLAGS) -c $< -o $@
@ -48,18 +48,6 @@ libsm_common.so.$(VER_MJR): libsm_common.so.$(VER)
libsm_common.so.$(VER): ${OBJS}
$(CXX) ${LDFLAGS} -Wl,--start-group $(LDLIBS) -Wl,-soname,libsm_common.so.$(VER_MJR) -o $@ $^
libsm_watchdog_nfs.so: libsm_watchdog_nfs.so.$(VER_MJR)
ln -sf $^ $@
libsm_watchdog_nfs.so.$(VER_MJR): libsm_watchdog_nfs.so.$(VER)
ln -sf $^ $@
libsm_watchdog_nfs.so.$(VER): libsm_common.so.$(VER) libsm_common.so
$(CXX) $(INCLUDES) $(CCFLAGS) $(EXTRACCFLAGS) sm_watchdog_nfs.c ${LDFLAGS} $(LDLIBS) -L./ -lsm_common -Wl,-soname,libsm_watchdog_nfs.so.$(VER_MJR) -o $@
sm_watchdog: libsm_common.so
$(CXX) $(INCLUDES) $(CCFLAGS) $(EXTRACCFLAGS) $(OBJS) sm_watchdog_module.c sm_watchdog_process.c sm_watchdog_main.c $(LDLIBS) -L./ -lsm_common -o sm_watchdog
sm_eru: libsm_common.so
$(CXX) $(INCLUDES) $(CCFLAGS) $(EXTRACCFLAGS) $(OBJS) sm_eru_process.c sm_eru_main.c $(LDLIBS) -L./ -lsm_common -o sm_eru
@ -71,15 +59,12 @@ install:
# renamed with '-' like they are in the bitbake file.
#
# install -d $(DEST_DIR)$(BIN_DIR)
# install sm_watchdog sm_eru sm_eru_dump $(DEST_DIR)$(BIN_DIR)
# install sm_eru sm_eru_dump $(DEST_DIR)$(BIN_DIR)
install -d $(DEST_DIR)$(LIB_DIR)
install libsm_common.so.${VER} $(DEST_DIR)$(LIB_DIR)
cp -P libsm_common.so libsm_common.so.$(VER_MJR) $(DEST_DIR)$(LIB_DIR)
install -d $(DEST_DIR)$(INC_DIR)
install -m 644 *.h $(DEST_DIR)$(INC_DIR)
install -d $(DEST_DIR)/var/lib/sm/watchdog/modules
install libsm_watchdog_nfs.so.${VER} $(DEST_DIR)/var/lib/sm/watchdog/modules
cp -P libsm_watchdog_nfs.so libsm_watchdog_nfs.so.${VER_MJR} $(DEST_DIR)/var/lib/sm/watchdog/modules
clean:
rm -f *.o *.so *.so.*

View File

@ -77,15 +77,12 @@ extern "C" {
#define SM_PROCESS_PID_FILENAME "/var/run/sm.pid"
#define SM_TRAP_PROCESS_PID_FILENAME "/var/run/sm-trap.pid"
#define SM_WATCHDOG_PROCESS_PID_FILENAME "/var/run/sm-watchdog.pid"
#define SM_ERU_PROCESS_PID_FILENAME "/var/run/sm-eru.pid"
#define SM_BOOT_COMPLETE_FILENAME "/var/run/sm_boot_complete"
#define SM_INDICATE_DEGRADED_FILENAME "/var/run/.sm_degraded"
#define SM_WATCHDOG_HEARTBEAT_FILENAME "/var/run/.sm_watchdog_heartbeat"
#define SM_DUMP_DATA_FILE "/tmp/sm_data_dump.txt"
#define SM_TROUBLESHOOT_LOG_FILE "/var/log/sm-troubleshoot.log"

View File

@ -15,9 +15,6 @@
#include <fcntl.h>
#include <errno.h>
#include <signal.h>
#include <time.h>
#include <utime.h>
#include <sys/types.h>
#include <sys/stat.h>
// ****************************************************************************
@ -127,77 +124,3 @@ SmErrorT sm_utils_clear_degraded( void )
return( SM_OKAY );
}
// ****************************************************************************
// ****************************************************************************
// Utils - Watchdog Heartbeat
// ==========================
// Refreshes the watchdog heartbeat file.
//
// Creates SM_WATCHDOG_HEARTBEAT_FILENAME when it does not exist yet, then
// sets its access and modification times to the current CLOCK_MONOTONIC_RAW
// seconds value (not wall-clock time); sm_utils_watchdog_delayed() compares
// the file's mtime against the same clock.  Failures are logged and the
// function simply returns.
void sm_utils_watchdog_heartbeat( void )
{
    struct utimbuf file_times;
    struct timespec ts_mono;

    clock_gettime( CLOCK_MONOTONIC_RAW, &ts_mono );

    memset( &file_times, 0, sizeof(struct utimbuf) );
    file_times.actime = ts_mono.tv_sec;
    file_times.modtime = ts_mono.tv_sec;

    if( 0 > access( SM_WATCHDOG_HEARTBEAT_FILENAME, F_OK ) )
    {
        // O_CLOEXEC is an open flag and belongs in the flags argument; the
        // mode argument carries permission bits only.
        int fd = open( SM_WATCHDOG_HEARTBEAT_FILENAME,
                       O_RDWR | O_CREAT | O_CLOEXEC,
                       S_IRUSR | S_IRGRP | S_IROTH );
        if( 0 > fd )
        {
            DPRINTFE( "Failed to create/open watchdog heartbeat, error=%s.",
                      strerror(errno) );
            return;
        }

        // Only the file's existence and timestamps matter; no data is written.
        close( fd );
    }

    if( 0 > utime( SM_WATCHDOG_HEARTBEAT_FILENAME, &file_times ) )
    {
        DPRINTFE( "Failed to update watchdog heartbeat timings, error=%s.",
                  strerror(errno) );
        return;
    }
}
// ****************************************************************************
// ****************************************************************************
// Utils - Watchdog Delayed
// =========================
// Reports whether the watchdog heartbeat file is older than max_delay_secs.
//
// The file's mtime is compared with the current CLOCK_MONOTONIC_RAW seconds.
// Returns true only when the age is greater than max_delay_secs and at most
// 300 seconds; returns false when the file is missing, stat() fails, or the
// age falls outside that window.
bool sm_utils_watchdog_delayed( int max_delay_secs )
{
    struct stat heartbeat_stat;
    struct timespec now_mono;
    int secs_since_beat;

    // No heartbeat file: nothing to measure against.
    if( 0 != access( SM_WATCHDOG_HEARTBEAT_FILENAME, F_OK ) )
    {
        return( false );
    }

    clock_gettime( CLOCK_MONOTONIC_RAW, &now_mono );

    if( stat( SM_WATCHDOG_HEARTBEAT_FILENAME, &heartbeat_stat ) < 0 )
    {
        DPRINTFE( "Stat failed on file (%s), error=%s.",
                  SM_WATCHDOG_HEARTBEAT_FILENAME, strerror( errno ) );
        return( false );
    }

    // Make sure that the elapsed seconds drift is in a valid range.
    secs_since_beat = now_mono.tv_sec - heartbeat_stat.st_mtime;
    if(( secs_since_beat <= max_delay_secs )||( 300 < secs_since_beat ))
    {
        return( false );
    }

    DPRINTFI( "SM-Watchdog has been delayed by more than %d "
              "seconds, elapsed_secs=%d", max_delay_secs,
              secs_since_beat );
    return( true );
}
// ****************************************************************************

View File

@ -50,18 +50,6 @@ extern SmErrorT sm_utils_indicate_degraded( void );
extern SmErrorT sm_utils_clear_degraded( void );
// ****************************************************************************
// ****************************************************************************
// Utils - Watchdog Heartbeat
// ==========================
extern void sm_utils_watchdog_heartbeat( void );
// ****************************************************************************
// ****************************************************************************
// Utils - Watchdog Delayed
// =========================
extern bool sm_utils_watchdog_delayed( int max_delay_secs );
// ****************************************************************************
#ifdef __cplusplus
}
#endif

View File

@ -1,49 +0,0 @@
//
// Copyright (c) 2014 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <libgen.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/types.h>
#include "sm_types.h"
#include "sm_debug.h"
#include "sm_watchdog_process.h"
// ****************************************************************************
// Main - Thread
// =============
int main( int argc, char *argv[], char *envp[] )
{
SmErrorT error;
error = sm_debug_initialize();
if( SM_OKAY != error )
{
printf( "Debug initialization failed, error=%s.\n",
sm_error_str( error ) );
return( EXIT_FAILURE );
}
error = sm_watchdog_process_main( argc, argv, envp );
if( SM_OKAY != error )
{
printf( "Process failure, error=%s.\n", sm_error_str( error ) );
return( EXIT_FAILURE );
}
error = sm_debug_finalize();
if( SM_OKAY != error )
{
printf( "Debug finalization failed, error=%s.\n",
sm_error_str( error ) );
}
return( EXIT_SUCCESS );
}
// ****************************************************************************

View File

@ -1,247 +0,0 @@
//
// Copyright (c) 2014 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
#include "sm_watchdog_module.h"
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <glib.h>
#include <gmodule.h>
#include "sm_types.h"
#include "sm_list.h"
#include "sm_timer.h"
#include "sm_debug.h"
#define SM_WATCHDOG_MODULE_FILENAME_MAX_SIZE 128
#define SM_WATCHDOG_MODULE_PATH "/var/lib/sm/watchdog/modules"
#define SM_WATCHDOG_MODULE_DO_CHECK_FUNC "sm_watchdog_module_do_check"
#define SM_WATCHDOG_MODULE_INITIALIZE_FUNC "sm_watchdog_module_initialize"
#define SM_WATCHDOG_MODULE_FINALIZE_FUNC "sm_watchdog_module_finalize"
typedef void (*SmWatchdogModuleDoCheckT) (void);
typedef bool (*SmWatchdogModuleInitializeT) (int* do_check_in_ms);
typedef bool (*SmWatchdogModuleFinalizeT) (void);
// Bookkeeping for one watchdog plug-in module loaded from
// SM_WATCHDOG_MODULE_PATH.
typedef struct
{
// Directory entry name the module was loaded from; also used as the name
// passed to sm_timer_register for the do-check timer.
gchar filename[SM_WATCHDOG_MODULE_FILENAME_MAX_SIZE];
// Handle returned by g_module_open.
GModule* glibmod;
// Filled in by the module's initialize hook and passed to
// sm_timer_register as the timer's millisecond value.
int do_check_in_ms;
// Timer id returned by sm_timer_register; used to map a timer callback
// back to its module.
SmTimerIdT do_check_timer_id;
// Hooks looked up with g_module_symbol; any of them may be NULL when the
// module does not export the corresponding symbol.
SmWatchdogModuleDoCheckT do_check;
SmWatchdogModuleInitializeT initialize;
SmWatchdogModuleFinalizeT finalize;
} SmWatchdogModuleT;
static SmListT* _modules = NULL;
// ****************************************************************************
// Watchdog Module - Do Check Timer
// ================================
// Timer callback shared by every module's do-check timer.
//
// Looks up the module whose do_check_timer_id equals timer_id and calls its
// do_check hook.  Returns true only when a matching module with a do_check
// hook was found and invoked; returns false when no module owns the timer or
// the owning module has no do_check hook.  user_data is not used.
static bool sm_watchdog_module_do_check_timer( SmTimerIdT timer_id,
    int64_t user_data )
{
    SmListT* entry = NULL;
    SmListEntryDataPtrT entry_data;
    SmWatchdogModuleT* candidate = NULL;
    SmWatchdogModuleT* module = NULL;

    // Keep the match in its own variable: the loop cursor still points at
    // the last visited element when nothing matches, and must not be
    // mistaken for the timer's owner.
    SM_LIST_FOREACH( _modules, entry, entry_data )
    {
        candidate = (SmWatchdogModuleT*) entry_data;
        if( NULL == candidate )
        {
            continue;
        }

        if( timer_id == candidate->do_check_timer_id )
        {
            DPRINTFD( "Found do-check timer for module (%s).",
                      g_module_name(candidate->glibmod) );
            module = candidate;
            break;
        }
    }

    if( NULL != module )
    {
        if( NULL != module->do_check )
        {
            DPRINTFD( "Calling do-check for module (%s).",
                      g_module_name(module->glibmod) );
            module->do_check();
            return( true );
        }
    } else {
        DPRINTFE( "Module not found for do-check timer." );
    }

    return( false );
}
// ****************************************************************************
// ***************************************************************************
// Watchdog Module - Load
// ======================
static SmErrorT sm_watchdog_module_load( const gchar* filename )
{
gchar* filepath;
SmWatchdogModuleT* module;
module = (SmWatchdogModuleT*) malloc( sizeof(SmWatchdogModuleT) );
if( NULL == module )
{
DPRINTFE( "Failed to allocate watchdog module." );
return( SM_FAILED );
}
memset( module, 0, sizeof(SmWatchdogModuleT) );
g_snprintf(module->filename, SM_WATCHDOG_MODULE_FILENAME_MAX_SIZE,
"%s", filename);
filepath = g_module_build_path( SM_WATCHDOG_MODULE_PATH, filename );
module->glibmod = g_module_open( filepath, G_MODULE_BIND_LAZY );
if( NULL == module->glibmod )
{
DPRINTFE( "Failed to open module (%s).", filepath );
free( module );
g_free( filepath );
return( SM_FAILED );
}
g_free( filepath );
g_module_symbol( module->glibmod, SM_WATCHDOG_MODULE_INITIALIZE_FUNC,
(gpointer*) &(module->initialize) );
g_module_symbol( module->glibmod, SM_WATCHDOG_MODULE_FINALIZE_FUNC,
(gpointer*) &(module->finalize) );
g_module_symbol( module->glibmod, SM_WATCHDOG_MODULE_DO_CHECK_FUNC,
(gpointer*) &(module->do_check) );
SM_LIST_PREPEND( _modules, (SmListEntryDataPtrT) module );
return( SM_OKAY );
}
// ***************************************************************************
// ***************************************************************************
// Watchdog Module - Load All
// ==========================
// Loads and initializes every module found in SM_WATCHDOG_MODULE_PATH.
//
// Each directory entry is passed to sm_watchdog_module_load; a module that
// fails to load is logged and skipped.  Every loaded module exporting an
// initialize hook is then initialized and given a do-check timer.
//
// Returns SM_FAILED when the directory cannot be opened or a module's
// initialize hook reports failure, the sm_timer_register error when timer
// registration fails, and SM_OKAY otherwise.
SmErrorT sm_watchdog_module_load_all( void )
{
    const gchar* file;
    GDir* directory;
    // GLib requires the GError pointer to be NULL before it is handed to a
    // function that may set it.
    GError* g_error = NULL;
    SmListT* entry = NULL;
    SmListEntryDataPtrT entry_data;
    SmWatchdogModuleT* module;
    SmErrorT error;

    directory = g_dir_open( SM_WATCHDOG_MODULE_PATH, 0, &g_error );
    if( NULL == directory )
    {
        DPRINTFE( "Failed to open directory( %s), error=%s",
                  SM_WATCHDOG_MODULE_PATH,
                  ( NULL != g_error ) ? g_error->message : "unknown" );
        if( NULL != g_error )
        {
            g_error_free( g_error );
        }
        return( SM_FAILED );
    }

    file = g_dir_read_name( directory );
    while( NULL != file )
    {
        DPRINTFI( "Loading module (%s).", file );

        error = sm_watchdog_module_load( file );
        if( SM_OKAY != error )
        {
            DPRINTFE( "Failed to load module (%s), error=%s.",
                      file, sm_error_str(error) );
        }

        file = g_dir_read_name( directory );
    }

    g_dir_close( directory );

    SM_LIST_FOREACH( _modules, entry, entry_data )
    {
        module = (SmWatchdogModuleT*) entry_data;
        if( NULL == module )
        {
            continue;
        }

        // Modules without an initialize hook get no do-check timer.
        if( NULL != module->initialize )
        {
            DPRINTFI( "Initializing module (%s).",
                      g_module_name(module->glibmod) );

            if( !(module->initialize( &(module->do_check_in_ms) )) )
            {
                DPRINTFE( "Failed to initialize %s.",
                          g_module_name(module->glibmod) );
                return( SM_FAILED );
            }

            error = sm_timer_register( module->filename,
                                       module->do_check_in_ms,
                                       sm_watchdog_module_do_check_timer,
                                       0, &(module->do_check_timer_id) );
            if( SM_OKAY != error )
            {
                DPRINTFE( "Failed to create module (%s) do-check timer, "
                          "error=%s.", g_module_name(module->glibmod),
                          sm_error_str( error ) );
                return( error );
            }
        }
    }

    return( SM_OKAY );
}
// ***************************************************************************
// ***************************************************************************
// Watchdog Module - Unload All
// ============================
SmErrorT sm_watchdog_module_unload_all( void )
{
SmListT* entry = NULL;
SmListEntryDataPtrT entry_data;
SmWatchdogModuleT* module;
SM_LIST_FOREACH( _modules, entry, entry_data )
{
module = (SmWatchdogModuleT*) entry_data;
if( NULL == module )
{
continue;
}
if( NULL != module->finalize )
{
DPRINTFI( "Finalizing module (%s).",
g_module_name(module->glibmod) );
if( !(module->finalize()) )
{
DPRINTFE( "Failed to finalize %s.",
g_module_name(module->glibmod) );
}
}
g_module_close( module->glibmod );
}
SM_LIST_CLEANUP_ALL( _modules );
return( SM_OKAY );
}
// ***************************************************************************

View File

@ -1,31 +0,0 @@
//
// Copyright (c) 2014 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
#ifndef __SM_WATCHDOG_MODULE_H__
#define __SM_WATCHDOG_MODULE_H__
#include "sm_types.h"
#ifdef __cplusplus
extern "C" {
#endif
// ****************************************************************************
// Watchdog Module - Load All
// ==========================
extern SmErrorT sm_watchdog_module_load_all( void );
// ****************************************************************************
// ****************************************************************************
// Watchdog Module - Unload All
// ============================
extern SmErrorT sm_watchdog_module_unload_all( void );
// ****************************************************************************
#ifdef __cplusplus
}
#endif
#endif // __SM_WATCHDOG_MODULE_H__

View File

@ -1,608 +0,0 @@
//
// Copyright (c) 2014 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
#include "sm_watchdog_nfs.h"
#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <fcntl.h>
#include <signal.h>
#include <errno.h>
#include <sched.h>
#include <pthread.h>
#include <dirent.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/resource.h>
#include "sm_types.h"
#include "sm_time.h"
#include "sm_debug.h"
#include "sm_node_utils.h"
#include "sm_node_stats.h"
#define SM_WATCHDOG_NFS_THREAD_NAME "(nfsd)"
#define SM_WATCHDOG_NFS_REBOOT_INPROGRESS 0xA5A5A5A5
#define SM_WATCHDOG_NFS_MAX_BLOCKED_THREADS 32
#define SM_WATCHDOG_NFS_CHECK_IN_MS 10000
#define SM_WATCHDOG_NFS_MAX_UNINTERRUPTIBLE_SLEEP 60000
#define SM_WATCHDOG_NFS_DELAY_REBOOT_IN_MS 60000
#define SM_WATCHDOG_NFS_DELAY_REBOOT_FORCE_IN_MS 480000
#define SM_WATCHDOG_NFS_DEBUG_FILE "/var/log/nfs.debug"
// One slot of the fixed-size table of NFS threads being tracked as blocked.
typedef struct
{
// True while the slot holds a tracked thread; free slots have it false.
bool inuse;
// Cleared when the slot is filled.  NOTE(review): where it is set is not
// visible in this part of the file.
bool stale;
// Id of the tracked thread; also used to build /proc/<pid>/... paths.
int pid;
// Time captured with sm_time_get when the slot was filled.
SmTimeT timestamp;
// Copy of the process status supplied when the slot was filled.
SmNodeProcessStatusT status;
} SmWatchDogNfsBlockedInfoT;
static uint32_t _nfs_reboot_inprogress;
static SmWatchDogNfsBlockedInfoT
_nfs_blocked_threads[SM_WATCHDOG_NFS_MAX_BLOCKED_THREADS];
// ****************************************************************************
// Watchdog NFS - Find Blocked Thread
// ==================================
// Returns the in-use table slot whose pid equals the given pid, or NULL
// when no such slot exists.
static SmWatchDogNfsBlockedInfoT* sm_watchdog_nfs_find_blocked_thread( int pid )
{
    int slot;

    for( slot = 0; slot < SM_WATCHDOG_NFS_MAX_BLOCKED_THREADS; ++slot )
    {
        SmWatchDogNfsBlockedInfoT* candidate = &(_nfs_blocked_threads[slot]);

        if( candidate->inuse && ( pid == candidate->pid ) )
        {
            return( candidate );
        }
    }

    return( NULL );
}
// ****************************************************************************
// ****************************************************************************
// Watchdog NFS - Add Blocked Thread
// =================================
// Records a blocked thread in the first free table slot, stamping it with
// the current time and a copy of *status.  Logs an error and records
// nothing when every slot is already in use.
static void sm_watchdog_nfs_add_blocked_thread( int pid,
    SmNodeProcessStatusT* status )
{
    SmWatchDogNfsBlockedInfoT* free_slot = NULL;
    int slot;

    // Locate the first unused slot, if any.
    for( slot = 0; slot < SM_WATCHDOG_NFS_MAX_BLOCKED_THREADS; ++slot )
    {
        if( !(_nfs_blocked_threads[slot].inuse) )
        {
            free_slot = &(_nfs_blocked_threads[slot]);
            break;
        }
    }

    if( NULL == free_slot )
    {
        DPRINTFE( "Not enough room for all the NFS blocked threads." );
        return;
    }

    free_slot->inuse = true;
    free_slot->stale = false;
    free_slot->pid = pid;
    sm_time_get( &(free_slot->timestamp) );
    memcpy( &(free_slot->status), status, sizeof(SmNodeProcessStatusT) );
}
// ****************************************************************************
// ****************************************************************************
// Watchdog NFS - Delete Blocked Thread
// ====================================
// Releases the table slot tracking the given pid by zeroing it; does
// nothing when the pid is not tracked.
static void sm_watchdog_nfs_delete_blocked_thread( int pid )
{
    SmWatchDogNfsBlockedInfoT* tracked;

    tracked = sm_watchdog_nfs_find_blocked_thread( pid );
    if( NULL == tracked )
    {
        return;
    }

    memset( tracked, 0, sizeof(SmWatchDogNfsBlockedInfoT) );
    tracked->inuse = false;
}
// ****************************************************************************
// ****************************************************************************
// Watchdog NFS - Child Detach
// ===========================
// Prepares a freshly forked child: moves it into its own process group,
// closes every descriptor below the RLIMIT_NOFILE soft limit and re-opens
// /dev/null as stdin, stdout and stderr.  When the limit cannot be read the
// descriptors are left untouched.
static void sm_watchdog_nfs_child_detach( void )
{
    struct rlimit file_limits;

    setpgid( 0, 0 );

    if( 0 == getrlimit( RLIMIT_NOFILE, &file_limits ) )
    {
        unsigned int fd_i;
        for( fd_i=0; fd_i < file_limits.rlim_cur; ++fd_i )
        {
            close( fd_i );
        }

        open( "/dev/null", O_RDONLY ); // stdin
        open( "/dev/null", O_WRONLY ); // stdout
        open( "/dev/null", O_WRONLY ); // stderr
    }
}
// ****************************************************************************

// ****************************************************************************
// Watchdog NFS - Child Delay
// ==========================
// Blocks the calling child until more than delay_ms milliseconds have
// elapsed, re-checking the elapsed time after each 10 second sleep.
static void sm_watchdog_nfs_child_delay( long delay_ms )
{
    long ms_expired;
    SmTimeT timestamp;

    sm_time_get( &timestamp );

    while( true )
    {
        ms_expired = sm_time_get_elapsed_ms( &timestamp );
        if( delay_ms < ms_expired )
        {
            break;
        }

        sleep( 10 ); // 10 seconds
    }
}
// ****************************************************************************

// ****************************************************************************
// Watchdog NFS - Do Reboot
// ========================
// Starts the reboot sequence used when an NFS hang has been detected.
//
// Four children are forked:
//   - a reboot child that execs /sbin/reboot after
//     SM_WATCHDOG_NFS_DELAY_REBOOT_IN_MS,
//   - a forced-reboot child that writes "b" to /proc/sysrq-trigger after
//     SM_WATCHDOG_NFS_DELAY_REBOOT_FORCE_IN_MS,
//   - an sm-troubleshoot child and a collect child that run immediately.
// The node is then marked unhealthy and the sched/stack information of every
// tracked blocked thread is appended to SM_WATCHDOG_NFS_DEBUG_FILE.
//
// The function does nothing when a reboot is already in progress, and
// returns early (without marking a reboot in progress) when either of the
// two reboot forks fails.
static void sm_watchdog_nfs_do_reboot( void )
{
    char cmd[2048];
    pid_t reboot_pid;
    pid_t reboot_force_pid;
    pid_t sm_troubleshoot_pid;
    pid_t collect_pid;
    int thread_i;
    SmWatchDogNfsBlockedInfoT* entry;
    SmErrorT error;

    if( SM_WATCHDOG_NFS_REBOOT_INPROGRESS == _nfs_reboot_inprogress )
    {
        DPRINTFD( "Reboot already inprogress." );
        return;
    }

    // Fork child to do the reboot.
    reboot_pid = fork();
    if( 0 > reboot_pid )
    {
        DPRINTFE( "Failed to fork process for reboot, error=%s.",
                  strerror( errno ) );
        return;

    } else if( 0 == reboot_pid ) {
        // Child process.
        char reboot_cmd[] = "reboot";
        char* reboot_argv[] = {reboot_cmd, NULL};
        char* reboot_env[] = {NULL};

        sm_watchdog_nfs_child_detach();
        sm_watchdog_nfs_child_delay( SM_WATCHDOG_NFS_DELAY_REBOOT_IN_MS );

        execve( "/sbin/reboot", reboot_argv, reboot_env );

        // Shouldn't get this far, else there was an error.
        exit(-1);
    }

    // Fork child to do reboot force.
    reboot_force_pid = fork();
    if( 0 > reboot_force_pid )
    {
        DPRINTFE( "Failed to fork process for reboot escalation, "
                  "error=%s.", strerror( errno ) );
        return;

    } else if( 0 == reboot_force_pid ) {
        // Child process.
        int sysrq_handler_fd;
        int sysrq_trigger_fd;

        sm_watchdog_nfs_child_detach();
        sm_watchdog_nfs_child_delay( SM_WATCHDOG_NFS_DELAY_REBOOT_FORCE_IN_MS );

        // Enable sysrq handling.  The child must terminate on failure: a
        // plain return would drop it back into the watchdog's own code path
        // as a second copy of the parent.
        sysrq_handler_fd = open( "/proc/sys/kernel/sysrq", O_RDWR | O_CLOEXEC );
        if( 0 > sysrq_handler_fd )
        {
            exit( EXIT_FAILURE );
        }

        write( sysrq_handler_fd, "1", 1 );
        close( sysrq_handler_fd );

        // Trigger sysrq command.
        sysrq_trigger_fd = open( "/proc/sysrq-trigger", O_RDWR | O_CLOEXEC );
        if( 0 > sysrq_trigger_fd )
        {
            exit( EXIT_FAILURE );
        }

        write( sysrq_trigger_fd, "b", 1 );
        close( sysrq_trigger_fd );

        exit( EXIT_SUCCESS );
    }

    _nfs_reboot_inprogress = SM_WATCHDOG_NFS_REBOOT_INPROGRESS;

    // Fork child to do the sm-troubleshoot.
    sm_troubleshoot_pid = fork();
    if( 0 > sm_troubleshoot_pid )
    {
        DPRINTFE( "Failed to fork process for sm-trouble, error=%s.",
                  strerror( errno ) );

    } else if( 0 == sm_troubleshoot_pid ) {
        // Child process.
        char troubleshoot_cmd[] = "sm-troubleshoot";
        char log_file[] = SM_TROUBLESHOOT_LOG_FILE;
        char* argv[] = {troubleshoot_cmd, log_file, NULL};
        char* env[] = {NULL};

        sm_watchdog_nfs_child_detach();

        execve( SM_TROUBLESHOOT_SCRIPT, argv, env );

        // Shouldn't get this far, else there was an error.
        exit(-1);
    }

    // Fork child to run collect.
    collect_pid = fork();
    if( 0 > collect_pid )
    {
        DPRINTFE( "Failed to fork process for collect, error=%s.",
                  strerror( errno ) );

    } else if( 0 == collect_pid ) {
        // Child process.
        char collect_cmd[] = "collect";
        char* argv[] = {collect_cmd, NULL};
        char* env[] = {NULL};

        sm_watchdog_nfs_child_detach();

        execve( "/usr/local/sbin/collect", argv, env );

        // Shouldn't get this far, else there was an error.
        exit(-1);
    }

    error = sm_node_utils_set_unhealthy();
    if( SM_OKAY != error )
    {
        DPRINTFE( "Failed to set node unhealthy, error=%s.",
                  sm_error_str(error) );
    }

    DPRINTFI( "*******************************************************" );
    DPRINTFI( "** Issuing a reboot of the system, NFS hang detected **" );
    DPRINTFI( "*******************************************************" );
    DPRINTFI( "Reboot (%i) process created.", (int) reboot_pid );
    DPRINTFI( "Reboot force (%i) process created.", (int) reboot_force_pid );
    DPRINTFI( "SM troubleshoot (%i) process created.", (int) sm_troubleshoot_pid );
    DPRINTFI( "Collect (%i) process created.", (int) collect_pid );

    // Append a header, then per-thread scheduler and stack dumps, to the
    // NFS debug file.
    snprintf( cmd, sizeof(cmd),
              "date >> %s; "
              "echo \"*******************************************\" >> %s; "
              "echo \"NFS HANG DETECTED\" >> %s", SM_WATCHDOG_NFS_DEBUG_FILE,
              SM_WATCHDOG_NFS_DEBUG_FILE, SM_WATCHDOG_NFS_DEBUG_FILE );
    system( cmd );

    for( thread_i=0; SM_WATCHDOG_NFS_MAX_BLOCKED_THREADS > thread_i;
         ++thread_i )
    {
        entry = &(_nfs_blocked_threads[thread_i]);

        if( entry->inuse )
        {
            snprintf( cmd, sizeof(cmd),
                      "date >> %s; "
                      "echo \"cat /proc/%i/sched\" >> %s; "
                      "cat /proc/%i/sched >> %s", SM_WATCHDOG_NFS_DEBUG_FILE,
                      entry->pid, SM_WATCHDOG_NFS_DEBUG_FILE, entry->pid,
                      SM_WATCHDOG_NFS_DEBUG_FILE );
            system( cmd );

            snprintf( cmd, sizeof(cmd),
                      "date >> %s; "
                      "echo \"cat /proc/%i/stack\" >> %s; "
                      "cat /proc/%i/stack >> %s", SM_WATCHDOG_NFS_DEBUG_FILE,
                      entry->pid, SM_WATCHDOG_NFS_DEBUG_FILE, entry->pid,
                      SM_WATCHDOG_NFS_DEBUG_FILE );
            system( cmd );
        }
    }

    snprintf( cmd, sizeof(cmd),
              "echo \"*******************************************\" >> %s",
              SM_WATCHDOG_NFS_DEBUG_FILE );
    system( cmd );
}
// ****************************************************************************
// ****************************************************************************
// Watchdog NFS - Search
// =====================
static void sm_watchdog_nfs_search( const char dir_name[] )
{
bool is_dir;
DIR* dir;
char path[PATH_MAX];
int path_len;
SmNodeProcessStatusT status;
SmErrorT error;
dir = opendir( dir_name );
if( NULL == dir )
{
DPRINTFE( "Failed to open directory (%s), error=%s.", dir_name,
strerror( errno ) );
return;
}
struct dirent* entry;
for( entry = readdir( dir ); NULL != entry; entry = readdir( dir ) )
{
is_dir = false;
path_len = snprintf( path, sizeof(path), "%s/%s", dir_name,
entry->d_name );
if( PATH_MAX <= path_len )
{
DPRINTFE( "Path (%s/%s) is too long, max_len=%i.",
dir_name, entry->d_name, path_len );
break;
}
if( 0 != (DT_REG & entry->d_type) )
{
if( '.' != entry->d_name[0] )
{
struct stat stat_data;
if( 0 > lstat( path, &stat_data ) )
{
DPRINTFE( "Stat on (%s) failed, error=%s.", entry->d_name,
strerror( errno ) );
continue;
}
is_dir = S_ISDIR( stat_data.st_mode );
}
} else if( 0 != (DT_DIR & entry->d_type) ) {
if(( 0 != strcmp( ".", entry->d_name ) )&&
( 0 != strcmp( "..", entry->d_name ) ))
{
is_dir = true;
}
}
if( is_dir )
{
long val;
char* end;
val = strtol( entry->d_name, &end, 10 );
if(( ERANGE == errno )&&
(( LONG_MIN == val ) ||( LONG_MAX == val )))
{
DPRINTFD( "Directory (%s) name out of range.",
entry->d_name );
continue;
}
if( end == entry->d_name )
{
DPRINTFD( "Directory (%s) is not a pid directory.",
entry->d_name );
continue;
}
error = sm_node_stats_get_process_status( val, &status );
if( SM_OKAY != error )
{
if( SM_NOT_FOUND == error )
{
DPRINTFD( "Failed to get %ld pid status, error=%s.",
val, sm_error_str(error) );
} else {
DPRINTFE( "Failed to get %ld pid status, error=%s.",
val, sm_error_str(error) );
}
continue;
}
DPRINTFD( "Looking at pid=%i, name=%s", status.pid, status.name );
if( 0 != strcmp( SM_WATCHDOG_NFS_THREAD_NAME, status.name ) )
{
DPRINTFD( "Process (%s) not an nfs thread, pid=%i.",
status.name, status.pid );
continue;
}
DPRINTFD( "NFS thread, pid=%i, state=%c, block_start_ns=%lld.",
status.pid, status.state, status.block_start_ns );
if(( 0 != status.block_start_ns )&&( 'D' == status.state ))
{
SmWatchDogNfsBlockedInfoT* entry;
entry = sm_watchdog_nfs_find_blocked_thread( (int) val );
if( NULL == entry )
{
sm_watchdog_nfs_add_blocked_thread( (int) val, &status );
} else if( status.block_start_ns == entry->status.block_start_ns ) {
long ms_expired;
entry->stale = false;
ms_expired = sm_time_get_elapsed_ms( &(entry->timestamp) );
if( SM_WATCHDOG_NFS_MAX_UNINTERRUPTIBLE_SLEEP < ms_expired )
{
sm_watchdog_nfs_do_reboot();
DPRINTFI( "Rebooting stuck nfs thread (%i).",
(int) val );
break;
} else {
if( (SM_WATCHDOG_NFS_MAX_UNINTERRUPTIBLE_SLEEP/2)
< ms_expired )
{
DPRINTFI( "WARNING: NFS thread, pid=%i, state=%c, "
"block_start_ns=%lld, elapsed_ms=%ld.",
status.pid, status.state,
status.block_start_ns, ms_expired );
}
}
} else {
sm_watchdog_nfs_delete_blocked_thread( (int) val );
sm_watchdog_nfs_add_blocked_thread( (int) val, &status );
}
} else {
sm_watchdog_nfs_delete_blocked_thread( (int) val );
}
}
}
closedir( dir );
}
// ****************************************************************************
// ****************************************************************************
// Watchdog NFS - Do Check
// =======================
// Runs one audit pass over the NFS threads: every tracked entry is flagged
// stale, "/proc" is scanned (the scan clears the flag on entries that are
// still blocked), and whatever is still stale afterwards is released.
// Nothing is audited once a reboot has been flagged as in progress.
void sm_watchdog_module_do_check( void )
{
    int slot_i;
    SmWatchDogNfsBlockedInfoT* slot;

    DPRINTFD( "NFS do check called." );

    if( SM_WATCHDOG_NFS_REBOOT_INPROGRESS == _nfs_reboot_inprogress )
    {
        DPRINTFD( "Reboot inprogress." );
        return;
    }

    // Assume every tracked thread is gone until the scan says otherwise.
    for( slot_i = 0; slot_i < SM_WATCHDOG_NFS_MAX_BLOCKED_THREADS; ++slot_i )
    {
        slot = &(_nfs_blocked_threads[slot_i]);
        if( slot->inuse )
        {
            slot->stale = true;
        }
    }

    // Audit NFS threads.
    sm_watchdog_nfs_search( "/proc" );

    // Release the slots the scan did not refresh.
    for( slot_i = 0; slot_i < SM_WATCHDOG_NFS_MAX_BLOCKED_THREADS; ++slot_i )
    {
        slot = &(_nfs_blocked_threads[slot_i]);
        if(( !slot->inuse )||( !slot->stale ))
        {
            continue;
        }

        memset( slot, 0, sizeof(SmWatchDogNfsBlockedInfoT) );
        slot->inuse = false;
    }
}
// ****************************************************************************
// ****************************************************************************
// Watchdog NFS - Initialize
// =========================
// Module entry point: hands the check interval back to the caller and
// resets the module state.
//
// do_check_in_ms: out parameter, set to SM_WATCHDOG_NFS_CHECK_IN_MS.
// Returns true unconditionally.
bool sm_watchdog_module_initialize( int* do_check_in_ms )
{
    *do_check_in_ms = SM_WATCHDOG_NFS_CHECK_IN_MS;

    // Forget any blocked threads and any reboot flagged earlier.
    memset( &_nfs_blocked_threads, 0, sizeof(_nfs_blocked_threads) );
    _nfs_reboot_inprogress = 0;

    return true;
}
// ****************************************************************************
// ****************************************************************************
// Watchdog NFS - Finalize
// =======================
// Module exit point: clears the blocked-thread table and the
// reboot-in-progress flag.
// Returns true unconditionally.
bool sm_watchdog_module_finalize( void )
{
    memset( &_nfs_blocked_threads, 0, sizeof(_nfs_blocked_threads) );
    _nfs_reboot_inprogress = 0;

    return true;
}
// ****************************************************************************

View File

@ -1,37 +0,0 @@
//
// Copyright (c) 2014 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
// Public interface of the NFS watchdog module: a periodic check plus the
// initialize/finalize pair that bracket it.
#ifndef __SM_WATCHDOG_NFS_H__
#define __SM_WATCHDOG_NFS_H__
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
// ****************************************************************************
// Watchdog NFS - Do Check
// =======================
// Runs one audit pass over the NFS threads. The implementation does nothing
// once it has flagged a reboot as in progress.
extern void sm_watchdog_module_do_check( void );
// ****************************************************************************
// ****************************************************************************
// Watchdog NFS - Initialize
// =========================
// Resets the module state and stores the interval, in milliseconds, at which
// the module's check is meant to run into *do_check_in_ms.
// The implementation always returns true.
extern bool sm_watchdog_module_initialize( int* do_check_in_ms );
// ****************************************************************************
// ****************************************************************************
// Watchdog NFS - Finalize
// =======================
// Clears the module state. The implementation always returns true.
extern bool sm_watchdog_module_finalize( void );
// ****************************************************************************
#ifdef __cplusplus
}
#endif
#endif // __SM_WATCHDOG_NFS_H__

View File

@ -1,241 +0,0 @@
//
// Copyright (c) 2014 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
#include "sm_watchdog_process.h"
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <signal.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <sched.h>
#include <limits.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <sys/syscall.h>
#include <sys/resource.h>
#include <sys/select.h>
#include <getopt.h>
#include "sm_limits.h"
#include "sm_types.h"
#include "sm_debug.h"
#include "sm_utils.h"
#include "sm_selobj.h"
#include "sm_time.h"
#include "sm_timer.h"
#include "sm_node_stats.h"
#include "sm_watchdog_module.h"
// Dispatch timeout and heartbeat period of the main loop, in milliseconds.
#define SM_WATCHDOG_PROCESS_TICK_INTERVAL_IN_MS 1000
// Main-loop run flag; the signal handler clears it on SIGINT, SIGTERM and
// SIGQUIT. It is volatile because it is written asynchronously from the
// handler and polled by the loop, so every read must go to memory.
static volatile sig_atomic_t _stay_on = 1;
// ****************************************************************************
// Watchdog Process - Signal Handler
// =================================
// Signal handler for the watchdog process.
// SIGINT, SIGTERM and SIGQUIT clear the run flag so the main loop exits;
// every other signal is only logged at debug level.
//
// signum: number of the signal being delivered.
//
// NOTE(review): DPRINTFD is called from signal context; confirm that the
// debug macro is async-signal-safe.
static void sm_watchdog_process_signal_handler( int signum )
{
    if(( SIGINT == signum )||( SIGTERM == signum )||( SIGQUIT == signum ))
    {
        // Ask the main loop to stop.
        _stay_on = 0;

    } else if( SIGCONT == signum ) {
        DPRINTFD( "Ignoring signal SIGCONT (%i).", signum );

    } else {
        DPRINTFD( "Signal (%i) ignored.", signum );
    }
}
// ****************************************************************************
// ****************************************************************************
// Watchdog Process - Setup Signal Handler
// =======================================
static void sm_watchdog_process_setup_signal_handler( void )
{
struct sigaction sa;
memset( &sa, 0, sizeof(sa) );
sa.sa_handler = sm_watchdog_process_signal_handler;
sigaction( SIGINT, &sa, NULL );
sigaction( SIGTERM, &sa, NULL );
sigaction( SIGQUIT, &sa, NULL );
sigaction( SIGCONT, &sa, NULL );
signal( SIGCHLD, SIG_IGN );
}
// ****************************************************************************
// ****************************************************************************
// Watchdog Process - Initialize
// =============================
// Brings up the subsystems the watchdog process relies on, in order: the
// selection object module, the timer module (ticking every
// SM_WATCHDOG_PROCESS_TICK_INTERVAL_IN_MS) and node stats.
//
// Stops at the first failure, logs it and returns that error; later
// subsystems are not attempted. Returns SM_OKAY when all three succeed.
static SmErrorT sm_watchdog_process_initialize( void )
{
    SmErrorT result;

    result = sm_selobj_initialize();
    if( SM_OKAY != result )
    {
        DPRINTFE( "Failed to initialize selection object module, error=%s.",
                  sm_error_str( result ) );

    } else if( SM_OKAY != (result = sm_timer_initialize(
                            SM_WATCHDOG_PROCESS_TICK_INTERVAL_IN_MS )) )
    {
        DPRINTFE( "Failed to initialize timer module, error=%s.",
                  sm_error_str( result ) );

    } else if( SM_OKAY != (result = sm_node_stats_initialize()) ) {
        DPRINTFE( "Failed to initialize node stats, error=%s.",
                  sm_error_str( result ) );
    }

    return( result );
}
// ****************************************************************************
// ****************************************************************************
// Watchdog Process - Finalize
// ===========================
static SmErrorT sm_watchdog_process_finalize( void )
{
SmErrorT error;
error = sm_node_stats_finalize();
if( SM_OKAY != error )
{
DPRINTFE( "Failed to finialize node stats, error=%s.",
sm_error_str( error ) );
}
error = sm_timer_finalize();
if( SM_OKAY != error )
{
DPRINTFE( "Failed to finalize timer module, error=%s.",
sm_error_str( error ) );
}
error = sm_selobj_finalize();
if( SM_OKAY != error )
{
DPRINTFE( "Failed to finalize selection object module, error=%s.",
sm_error_str( error ) );
}
return( SM_OKAY );
}
// ****************************************************************************
// ****************************************************************************
// Watchdog Process - Main
// =======================
// Entry point of the sm-watchdog process.
//
// Installs the signal handlers, refuses to start a second instance (checked
// through the pid file), writes the pid file, initializes the process
// subsystems and loads the watchdog modules. It then dispatches selection
// objects and sends a watchdog heartbeat roughly once per tick, for as long
// as timer scheduling is on time, until a termination signal clears the run
// flag or dispatching fails. On the way out the modules are unloaded and the
// subsystems finalized.
//
// argc, argv, envp: accepted for the process-main signature; not used here.
//
// Returns SM_OKAY after a normal shutdown or when another instance is
// already running, SM_FAILED when the pid file cannot be written, or the
// error reported by subsystem initialization or module loading.
SmErrorT sm_watchdog_process_main( int argc, char *argv[], char *envp[] )
{
    long ms_expired;
    SmTimeT watchdog_heartbeat_time_prev;
    SmErrorT error;

    sm_watchdog_process_setup_signal_handler();

    DPRINTFI( "Starting" );

    if( sm_utils_process_running( SM_WATCHDOG_PROCESS_PID_FILENAME ) )
    {
        DPRINTFI( "Already running an instance of sm-watchdog." );
        return( SM_OKAY );
    }

    if( !sm_utils_set_pid_file( SM_WATCHDOG_PROCESS_PID_FILENAME ) )
    {
        DPRINTFE( "Failed to write pid file for sm-watchdog, error=%s.",
                  strerror(errno) );
        return( SM_FAILED );
    }

    error = sm_watchdog_process_initialize();
    if( SM_OKAY != error )
    {
        DPRINTFE( "Failed initialize process, error=%s.",
                  sm_error_str(error) );
        return( error );
    }

    error = sm_watchdog_module_load_all();
    if( SM_OKAY != error )
    {
        SmErrorT finalize_error;

        DPRINTFE( "Failed load modules, error=%s.",
                  sm_error_str(error) );

        // The subsystems were initialized just above; release them before
        // bailing out instead of leaving them set up.
        finalize_error = sm_watchdog_process_finalize();
        if( SM_OKAY != finalize_error )
        {
            DPRINTFE( "Failed to finalize process, error=%s.",
                      sm_error_str( finalize_error ) );
        }

        return( error );
    }

    DPRINTFI( "Started." );

    sm_time_get( &watchdog_heartbeat_time_prev );
    sm_utils_watchdog_heartbeat();

    while( _stay_on )
    {
        error = sm_selobj_dispatch( SM_WATCHDOG_PROCESS_TICK_INTERVAL_IN_MS );
        if( SM_OKAY != error )
        {
            DPRINTFE( "Selection object dispatch failed, error=%s.",
                      sm_error_str(error) );
            break;
        }

        ms_expired = sm_time_get_elapsed_ms( &watchdog_heartbeat_time_prev );
        if( SM_WATCHDOG_PROCESS_TICK_INTERVAL_IN_MS <= ms_expired )
        {
            // Heartbeat only while timers are being scheduled on time; the
            // previous-heartbeat timestamp is left untouched otherwise.
            if( sm_timer_scheduling_on_time() )
            {
                sm_utils_watchdog_heartbeat();
                sm_time_get( &watchdog_heartbeat_time_prev );
            }
        }
    }

    DPRINTFI( "Shutting down." );

    error = sm_watchdog_module_unload_all();
    if( SM_OKAY != error )
    {
        DPRINTFE( "Failed unload modules, error=%s.",
                  sm_error_str(error) );
    }

    error = sm_watchdog_process_finalize();
    if( SM_OKAY != error )
    {
        DPRINTFE( "Failed to finalize process, error=%s.",
                  sm_error_str( error ) );
    }

    DPRINTFI( "Shutdown complete." );

    return( SM_OKAY );
}
// ****************************************************************************

View File

@ -1,25 +0,0 @@
//
// Copyright (c) 2014 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
// Public interface of the sm-watchdog process.
#ifndef __SM_WATCHDOG_PROCESS_H__
#define __SM_WATCHDOG_PROCESS_H__
#include "sm_types.h"
#ifdef __cplusplus
extern "C" {
#endif
// ****************************************************************************
// Watchdog Process - Main
// =======================
// Runs the watchdog process: sets up signal handling and the pid file,
// initializes the process, loads the watchdog modules and loops until a
// termination signal is received, then unloads the modules and finalizes.
// Returns SM_OKAY after a normal shutdown or when another instance is already
// running; otherwise an error from start-up.
// argc, argv and envp are not used by the implementation.
extern SmErrorT sm_watchdog_process_main( int argc, char *argv[], char *envp[] );
// ****************************************************************************
#ifdef __cplusplus
}
#endif
#endif // __SM_WATCHDOG_PROCESS_H__

View File

@ -1,6 +1,6 @@
[Unit]
Description=Service Management Unit
After=network-online.target syslog-ng.service config.service sm-watchdog.service systemd-udev-settle.service drbd.service
After=network-online.target syslog-ng.service config.service systemd-udev-settle.service drbd.service
Before=sm-shutdown.service sm-api.service pmon.service
[Service]

View File

@ -67,9 +67,6 @@ timeout --signal KILL 5s pmap -x `cat /var/run/sm-trap.pid`
delimiter "pmap -x cat /var/run/sm-eru.pid"
timeout --signal KILL 5s pmap -x `cat /var/run/sm-eru.pid`
delimiter "pmap -x cat /var/run/sm-watchdog.pid"
timeout --signal KILL 5s pmap -x `cat /var/run/sm-watchdog.pid`
delimiter "top -b -n 1 -H -c"
timeout --signal KILL 5s top -b -n 1 -H -c

View File

@ -19,14 +19,11 @@
#include <sys/resource.h>
#include "sm_types.h"
#include "sm_utils.h"
#include "sm_debug.h"
#include "sm_sha512.h"
#include "sm_service_action_table.h"
#include "sm_service_action_result_table.h"
#define SM_SERVICE_ACTION_MAX_DELAY_IN_SECS 4
#define SM_SERVICE_ACTION_TIMER_SKEW_IN_MS 60000
#define SM_SERVICE_ACTION_VALIDATE_TIMER_IN_MS 60000
// ****************************************************************************
@ -839,15 +836,6 @@ SmErrorT sm_service_action_run( char service_name[], char instance_name[],
*process_id = (int) pid;
*timeout_in_ms = action_data->timeout_in_secs * 1000;
if( sm_utils_watchdog_delayed( SM_SERVICE_ACTION_MAX_DELAY_IN_SECS ) )
{
DPRINTFI( "Service (%s) timeout %d secs increased by %d ms, "
"sm-watchdog delayed.", action_data->service_name,
action_data->timeout_in_secs,
SM_SERVICE_ACTION_TIMER_SKEW_IN_MS );
*timeout_in_ms += SM_SERVICE_ACTION_TIMER_SKEW_IN_MS;
}
DPRINTFD( "Child process (%i) created for service (%s).", *process_id,
action_data->service_name );
}

View File

@ -40,9 +40,7 @@ typedef struct
SmServiceGroupNotificationT service_group_notification;
} SmNotificationEnvT;
#define SM_NOTIFICATION_SCRIPT_MAX_DELAY_IN_SECS 4
#define SM_NOTIFICATION_SCRIPT_TIMEOUT_IN_MS 30000
#define SM_NOTIFICATION_SCRIPT_TIMER_SKEW_IN_MS 60000
#define SM_NOTIFICATION_SCRIPT_SUCCESS 0
#define SM_NOTIFICATION_SCRIPT_TIMEOUT -65534
#define SM_NOTIFICATION_SCRIPT_FAILURE -65535
@ -712,14 +710,6 @@ SmErrorT sm_service_group_notification_notify( SmServiceGroupT* service_group,
snprintf( timer_name, sizeof(timer_name), "%s %s notification ",
service_group->name, notification_str );
if( sm_utils_watchdog_delayed( SM_NOTIFICATION_SCRIPT_MAX_DELAY_IN_SECS ) )
{
DPRINTFI( "Notification timeout %d secs increased by %d ms, "
"sm-watchdog delayed.", timeout_in_ms,
SM_NOTIFICATION_SCRIPT_TIMER_SKEW_IN_MS );
timeout_in_ms += SM_NOTIFICATION_SCRIPT_TIMER_SKEW_IN_MS;
}
error = sm_timer_register( timer_name, timeout_in_ms,
sm_service_group_notification_timeout,
service_group->id, &timer_id );