
Verified with Multi-node and duplex deploy and log checking Closes-Bug:1794937 Change-Id: I3af74efbf66fe71e4920f6cfd7af2b2ef4cb68b6 Signed-off-by: Martin, Chen <haochuan.z.chen@intel.com>
278 lines
8.6 KiB
C++
278 lines
8.6 KiB
C++
/*
|
|
* Copyright (c) 2015-2016 Wind River Systems, Inc.
|
|
*
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*
|
|
*/
|
|
|
|
/**
|
|
* @file
|
|
* Wind River CGCS Platform Host Watchdog Service Handler
|
|
*/
|
|
|
|
#include "hostw.h"
|
|
#include <linux/watchdog.h>
|
|
#include <fcntl.h>
|
|
#include <unistd.h> /* for execve */
|
|
|
|
/**
 * Log a message through elog() and, on a best-effort basis, append the same
 * text to the console device, for the purpose of capturing data when the
 * system is about to go down.
 *
 * The console path is taken from the daemon configuration
 * (hostwd_console_path) and is expected to be something like "/dev/console",
 * "/dev/pts/0" or "/dev/ttyS0". If the path is not set, or the device cannot
 * be opened, only the elog() output is produced.
 *
 * Takes printf-style arguments. The arguments are expanded twice (once for
 * elog, once for fprintf), so they must not have side effects.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement and is safe inside an unbraced if/else; callers must
 * terminate the invocation with a semicolon. The local names carry a suffix
 * to avoid shadowing identifiers used in the caller's arguments.
 */
#define emergency_log(...)                                                    \
    do {                                                                      \
        daemon_config_type *emerg_cfg_ = daemon_get_cfg_ptr ();               \
        elog(__VA_ARGS__);                                                    \
        if (emerg_cfg_->hostwd_console_path) {                                \
            FILE *emerg_console_ =                                            \
                fopen(emerg_cfg_->hostwd_console_path, "a");                  \
            if (NULL != emerg_console_) {                                     \
                fprintf(emerg_console_, __VA_ARGS__);                         \
                fclose (emerg_console_);                                      \
            }                                                                 \
        }                                                                     \
    } while (0)
|
|
|
|
/* Forward declaration: reads and handles one message from the status socket.
 * Defined later in this file; called from hostw_service() when the status
 * socket is readable. Returns PASS or FAIL. */
int hostw_service_command ( hostw_socket_type * hostw_socket );
|
|
|
|
/* Forward declaration of the file-local helper that forks the logging
 * process; called from hostw_log_and_reboot(). */
static void fork_hostwd_logger ( void );
|
|
|
|
/* Push daemon state to the log file.
 * This implementation has nothing to dump, so the body is empty. */
void daemon_dump_info ( void )
{
    /* intentionally empty */
}
|
|
|
|
/* SIGCHLD hook (going by the name); this implementation takes no action. */
void daemon_sigchld_hdlr ( void )
{
    /* intentionally empty */
}
|
|
|
|
/**
|
|
* This is the main loop of the program
|
|
*
|
|
* We loop waiting for messages to arrive. We're allowed to miss some messages
|
|
* (due to jitter in system scheduling, overcommitted resources, etc) and this
|
|
* can be tuned using the config file. On each loop (regardless of message
|
|
* received) we pet the watchdog.
|
|
*
|
|
* If a large number of messages are missed, or the messages conistently
|
|
* indicate system issues, we take the appropriate action (log what we can
|
|
* and reboot the system).
|
|
*/
|
|
void hostw_service ( void )
|
|
{
|
|
std::list<int> socks ; /* we have a "list" of 1 socket, to allow for
|
|
* future extension and to mirror code flow of
|
|
* other utilities */
|
|
hostw_socket_type * hostw_socket = hostw_getSock_ptr ();
|
|
struct timeval timeout;
|
|
int rc;
|
|
|
|
hostw_ctrl_type *ctrl = get_ctrl_ptr ();
|
|
daemon_config_type *config = daemon_get_cfg_ptr ();
|
|
|
|
ctrl->pmon_grace_loops = config->hostwd_failure_threshold + 1;
|
|
|
|
socks.clear();
|
|
if ( hostw_socket->status_sock )
|
|
{
|
|
socks.push_front (hostw_socket->status_sock);
|
|
FD_SET(hostw_socket->status_sock, &(hostw_socket->readfds));
|
|
}
|
|
socks.sort();
|
|
|
|
ilog("Host Watchdog Service running\n");
|
|
for ( ; ; )
|
|
{
|
|
timeout.tv_sec = config->hostwd_update_period;
|
|
timeout.tv_usec=0;
|
|
|
|
/* pet the watchdog */
|
|
kernel_watchdog_pet();
|
|
|
|
/* set the master fd_set */
|
|
FD_ZERO(&(hostw_socket->readfds));
|
|
FD_SET(hostw_socket->status_sock, &(hostw_socket->readfds));
|
|
|
|
rc = select (socks.back() + 1,
|
|
&(hostw_socket->readfds), NULL, NULL, &timeout);
|
|
|
|
/* If the select time out expired then no new message to process */
|
|
if ( rc < 0 )
|
|
{
|
|
/* Check to see if the select call failed. */
|
|
/* ... but filter Interrupt signal */
|
|
if ( errno != EINTR )
|
|
{
|
|
elog ("Select Failed (rc:%d) %s \n", errno, strerror(errno));
|
|
ctrl->pmon_grace_loops--;
|
|
}
|
|
}
|
|
else if ( rc == 0 )
|
|
{
|
|
if (daemon_is_file_present(NODE_LOCKED_FILE))
|
|
{
|
|
wlog( "Did not receive message from PMON, however node is"
|
|
" locked -- refusing to take reset action while locked\n" );
|
|
}
|
|
else
|
|
{
|
|
ctrl->pmon_grace_loops--;
|
|
|
|
/* we missed a message from PMON */
|
|
if ( ctrl->pmon_grace_loops == (config->hostwd_failure_threshold - 1) )
|
|
{
|
|
ilog ("Did not receive expected message from PMON - first missed message\n");
|
|
}
|
|
ctrl->pmon_grace_loops--;
|
|
if ( ctrl->pmon_grace_loops <= 2 )
|
|
{
|
|
ilog ("Did not receive expected message from PMON - %d more missed messages allowed\n",
|
|
ctrl->pmon_grace_loops);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (FD_ISSET(hostw_socket->status_sock, &(hostw_socket->readfds)))
|
|
{
|
|
rc = hostw_service_command ( hostw_socket);
|
|
if ( rc == PASS ) /* got "all is well" message */
|
|
{
|
|
ctrl->pmon_grace_loops = config->hostwd_failure_threshold;
|
|
}
|
|
}
|
|
}
|
|
if (0 >= ctrl->pmon_grace_loops)
|
|
{
|
|
if (daemon_is_file_present(NODE_LOCKED_FILE))
|
|
{
|
|
wlog( "Host watchdog (hostwd) not receiving messages from PMON"
|
|
" however host is locked - refusing to take reset action"
|
|
" while locked\n" );
|
|
}
|
|
else
|
|
{
|
|
emergency_log( "*** Host watchdog (hostwd) not receiving messages "
|
|
"from PMON ***\n");
|
|
hostw_log_and_reboot();
|
|
}
|
|
}
|
|
|
|
daemon_signal_hdlr ();
|
|
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Parse and react to a message from PMON
|
|
*/
|
|
int hostw_service_command ( hostw_socket_type * hostw_socket)
|
|
{
|
|
mtc_message_type msg[2]; /* we use a chunk of memory larger than a single
|
|
* mtc_message_type to check for oversized messages
|
|
* (invalid...)
|
|
*/
|
|
int len = sizeof(msg[0]) + 1;
|
|
|
|
memset(msg, 0, 2*sizeof(msg[0]));
|
|
socklen_t addrlen = (socklen_t) sizeof(hostw_socket->status_addr);
|
|
len = recvfrom(hostw_socket->status_sock,
|
|
(char*)&msg,
|
|
len,
|
|
0,
|
|
(struct sockaddr*) &hostw_socket->status_addr,
|
|
&addrlen);
|
|
|
|
if (sizeof(msg[0]) == len)
|
|
{
|
|
/* message is correct size, check pmon reported status */
|
|
switch (msg[0].cmd)
|
|
{
|
|
case MTC_CMD_NONE:
|
|
/* All is well */
|
|
return PASS;
|
|
|
|
case MTC_EVENT_PMON_CRIT:
|
|
if (daemon_is_file_present(NODE_LOCKED_FILE))
|
|
{
|
|
ilog( "PMON reports unrecoverable system, however node is"
|
|
" locked - considering this an OK message\n" );
|
|
return PASS;
|
|
}
|
|
else
|
|
{
|
|
emergency_log( "*** PMON reports unrecoverable system - message '%s' ***\n", msg[0].buf);
|
|
hostw_log_and_reboot();
|
|
}
|
|
return FAIL;
|
|
|
|
default:
|
|
elog("Unknown status reported\n");
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* bad message size */
|
|
elog("Host Watchdog received bad or corrupted message (length = %d)\n", len);
|
|
}
|
|
return FAIL;
|
|
}
|
|
|
|
/**
|
|
* Host watchdog (or PMON) has determined that the system is not healthy and is
|
|
* performing recovery action.
|
|
*/
|
|
void hostw_log_and_reboot()
|
|
{
|
|
daemon_config_type* config = daemon_get_cfg_ptr ();
|
|
|
|
emergency_log ("*** Host Watchdog declaring system unhealthy ***\n");
|
|
|
|
/* start the process to log as much data as possible */
|
|
fork_hostwd_logger ();
|
|
|
|
if (config->hostwd_reboot_on_err) {
|
|
emergency_log ("*** Initiating reboot ***\n");
|
|
|
|
/* start the process that will perform an ungraceful reboot, if
|
|
* the graceful reboot fails */
|
|
fork_sysreq_reboot ( FORCE_REBOOT_DELAY );
|
|
|
|
/* start the graceful reboot process */
|
|
fork_graceful_reboot ( GRACEFUL_REBOOT_DELAY );
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Initiate the thread which logs as much information about the system
|
|
* as possible.
|
|
*/
|
|
static void fork_hostwd_logger ( void )
|
|
{
|
|
int parent = double_fork ();
|
|
if (0 > parent) /* problem forking */
|
|
{
|
|
elog ("failed to fork hostwd logging process\n");
|
|
return ;
|
|
}
|
|
else if (0 == parent) /* if we're the child */
|
|
{
|
|
sigset_t mask , mask_orig ;
|
|
|
|
setup_child(false); /* initialize the process group, etc */
|
|
ilog ("*** Host Watchdog Logging Thread ***\n");
|
|
|
|
sigemptyset (&mask);
|
|
sigaddset (&mask, SIGTERM );
|
|
sigprocmask (SIG_BLOCK, &mask, &mask_orig );
|
|
|
|
/* TODO - log data here */
|
|
exit (0);
|
|
}
|
|
}
|
|
|