metal/mtce/src/hwmon/hwmonHdlr.cpp

2533 lines
108 KiB
C++

/*
* Copyright (c) 2013-2017 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
*/
/**
* @file
* Wind River CGCS Platform Hardware Monitor Service Handler
*/
#include "daemon_ini.h"
#include "nodeBase.h" /* for ... mtce common definitions */
#include "jsonUtil.h" /* for ... json utilities */
#include "regexUtil.h" /* for ... regexUtil_pattern_match */
#include "tokenUtil.h" /* for ... tokenUtil_new_token */
#include "nodeUtil.h" /* for ... mtce common utilities */
#include "bmcUtil.h" /* for ... mtce-common board management */
#include "hwmon.h" /* for ... service module header */
#include "hwmonUtil.h" /* for ... utilities, ie clear_logged_state */
#include "hwmonClass.h" /* for ... service class definition */
#include "hwmonSensor.h" /* for ... this module header */
#include "hwmonHttp.h" /* for ... hwmonHttp_mod_group */
#include "hwmonAlarm.h" /* for ... hwmonAlarm_major */
#include "hwmonBmc.h" /* for ... QUANTA_SAMPLE_PROFILE_.. */
/* Declare the Hardware Monitor Inventory Object */
hwmonHostClass hostInv ;
/* Public accessor for the Hardware Monitor Inventory object.
 * Returns the address of the file-scope 'hostInv' instance. */
hwmonHostClass * get_hwmonHostClass_ptr ( void )
{
    hwmonHostClass * inventory_ptr = &hostInv ;
    return inventory_ptr ;
}
/* Preserve a local copy of a pointer to the control struct to
 * avoid having to publish a get utility prototype into hwmon.h.
 * Assigned by hwmon_hdlr_init ; NULL until then. */
static hwmon_ctrl_type * _hwmon_ctrl_ptr = NULL ;
/* hwmonTimer_audit - periodic audit timer.
 * Started in hwmon_service with ctrl_ptr->audit_period ; when it rings
 * the service loop dumps timer data and calls hostInv.set_degrade_audit() */
static struct mtc_timer hwmonTimer_audit ;
/* hwmonTimer_token - authentication token refresh timer.
 * Started in hwmon_service with the configured token_refresh_rate and
 * passed to tokenUtil_manage_token on every pass of the service loop */
static struct mtc_timer hwmonTimer_token ;
/** List of server profile files */
std::list<string> profile_files ;
/* Iterator over a list of strings ; its use is not visible in this part
 * of the file - presumably walks profile_files (TODO confirm) */
std::list<string>::iterator string_iter_ptr ;
/*****************************************************************************
*
* Name : _stage_change
*
* Description: Change the sensor monitor FSM stage.
*
****************************************************************************/
static std::string monitorStages_str[HWMON_SENSOR_MONITOR__STAGES+1];
void _stage_change ( string hostname, monitor_ctrl_stage_enum & nowStage, monitor_ctrl_stage_enum newStage )
{
    /* Purpose: move the sensor monitor FSM of 'hostname' to 'newStage'.
     *
     * hostname - host the FSM belongs to ; used for logging only
     * nowStage - the FSM stage variable ; updated in place
     * newStage - requested stage ; if it is not a valid stage the FSM
     *            is forced to HWMON_SENSOR_MONITOR__START instead
     *
     * The stage values are validated as plain integers before they are
     * used to index the stage name table. Callers invoke this function
     * to recover from a corrupt current stage, so 'nowStage' cannot be
     * trusted as an index either. */
    const int now_index = static_cast<int>(nowStage) ;
    const int new_index = static_cast<int>(newStage) ;

    const bool now_valid = (( now_index >= 0 ) &&
                            ( now_index < HWMON_SENSOR_MONITOR__STAGES )) ;
    const bool new_valid = (( new_index >= 0 ) &&
                            ( new_index < HWMON_SENSOR_MONITOR__STAGES )) ;

    if ( new_valid )
    {
        /* never read the name table with an out-of-range current stage */
        clog ("%s sensor monitor stage change from %s -> %s\n",
                  hostname.c_str(),
                  now_valid ? monitorStages_str[now_index].c_str() : "Unknown",
                  monitorStages_str[new_index].c_str());
        nowStage = newStage ;
    }
    else
    {
        slog ("%s sensor monitor stage change to '%d' is invalid ; switching to START\n",
                  hostname.c_str(),
                  newStage );
        nowStage = HWMON_SENSOR_MONITOR__START ;
    }
}
/*******************************************************************
* Module Initialize and Finalizes Interfaces *
*******************************************************************/
/* Initial init of the module level audit and token refresh timers.
 * Both are initialized with the "controller" hostname label.
 * Not run on a sighup */
void hwmon_timer_init ( void )
{
mtcTimer_init ( hwmonTimer_audit, "controller", "audit timer" ) ;
mtcTimer_init ( hwmonTimer_token, "controller", "token timer") ;
}
/* Register the realtime signal handler with the kernel.
 *
 * The registration is only compiled in when WANT_MORE_SIGNAL_HANDLING is
 * defined ; otherwise the function does nothing and returns PASS.
 *
 * Returns PASS (or the zero sigaction result) on success and
 * FAIL_SIGNAL_INIT if sigaction rejects the registration.
 *
 * NOTE(review): the conditional code references _pmon_ctrl_ptr and
 * _process_death_hdlr which are not declared in the visible part of this
 * file - confirm they exist before enabling WANT_MORE_SIGNAL_HANDLING. */
int signal_hdlr_init ( int sig_num )
{
    int rc = PASS ;

    UNUSED(sig_num) ;

#ifdef WANT_MORE_SIGNAL_HANDLING

    memset (&_pmon_ctrl_ptr->info, 0, sizeof(_pmon_ctrl_ptr->info));
    memset (&_pmon_ctrl_ptr->prev, 0, sizeof(_pmon_ctrl_ptr->info));

    _pmon_ctrl_ptr->info.sa_sigaction = _process_death_hdlr ;
    _pmon_ctrl_ptr->info.sa_flags = (SA_NOCLDSTOP | SA_NOCLDWAIT | SA_SIGINFO) ;

    rc = sigaction ( sig_num, &_pmon_ctrl_ptr->info , &_pmon_ctrl_ptr->prev );
    if ( rc == 0 )
    {
        ilog("Registering Realtime Signal %d\n", sig_num);
    }
    else
    {
        elog("Registering Realtime Signal %d - (%d) (%s)\n",
              sig_num, errno, strerror(errno));
        rc = FAIL_SIGNAL_INIT ;
    }

#endif

    return (rc) ;
}
/*
 * Init the handler
 *  - saves the control struct pointer for later use by this module
 *  - loads the sensor monitor stage name table used for logging
 *  - Must support re-init that might occur over a SIGHUP
 *
 * Always returns PASS.
 **/
int hwmon_hdlr_init ( hwmon_ctrl_type * ctrl_ptr )
{
    /* stage -> printable name ; applied in the listed order */
    static const struct
    {
        int          stage ;
        const char * name  ;
    } stage_names [] =
    {
        { HWMON_SENSOR_MONITOR__START  , "Start"       },
        { HWMON_SENSOR_MONITOR__DELAY  , "Delay"       },
        { HWMON_SENSOR_MONITOR__READ   , "Read"        },
        { HWMON_SENSOR_MONITOR__PARSE  , "Parse"       },
        { HWMON_SENSOR_MONITOR__CHECK  , "Check"       },
        { HWMON_SENSOR_MONITOR__UPDATE , "Update"      },
        { HWMON_SENSOR_MONITOR__HANDLE , "Handle"      },
        { HWMON_SENSOR_MONITOR__FAIL   , "Fail"        },
        { HWMON_SENSOR_MONITOR__POWER  , "Power Query" },
        { HWMON_SENSOR_MONITOR__RESTART, "Restart"     },
        { HWMON_SENSOR_MONITOR__IDLE   , "Idle"        },
    } ;

    /* Save the control pointer */
    _hwmon_ctrl_ptr = ctrl_ptr ;

    const unsigned int entries = sizeof(stage_names)/sizeof(stage_names[0]) ;
    for ( unsigned int i = 0 ; i < entries ; ++i )
    {
        monitorStages_str[stage_names[i].stage] = stage_names[i].name ;
    }

    return (PASS) ;
}
/* Module Cleanup
 * Nothing is released here ; ctrl_ptr is accepted but not used. */
void hwmon_hdlr_fini ( hwmon_ctrl_type * ctrl_ptr )
{
UNUSED(ctrl_ptr) ;
}
/*******************************************************************
* Module Utilities *
******************************************************************/
/* SIGCHLD handler support - for waitpid
 * This implementation only emits a debug log. */
void daemon_sigchld_hdlr ( void )
{
dlog3 ("Received SIGCHLD ...\n");
}
/* Looks up the timer ID and asserts the corresponding ringer */
void hwmonHostClass::timer_handler ( int sig, siginfo_t *si, void *uc)
{
timer_t * tid_ptr = (void**)si->si_value.sival_ptr ;
struct hwmonHostClass::hwmon_host * hwmon_host_ptr ;
/* Avoid compiler errors/warnings for parms we must
* have but currently do nothing with */
UNUSED(sig);
UNUSED(uc);
if ( tid_ptr == NULL )
{
return ;
}
else if ( *tid_ptr == NULL )
{
return ;
}
/* Audit Timer */
else if ( *tid_ptr == hwmonTimer_audit.tid )
{
hwmonTimer_audit.ring = true ;
return ;
}
/* Token refresh Timer */
else if ( *tid_ptr == hwmonTimer_token.tid )
{
mtcTimer_stop_int_safe ( hwmonTimer_token );
hwmonTimer_token.ring = true ;
return ;
}
else
{
hwmon_host_ptr = getHost_timer ( *tid_ptr ) ;
if ( hwmon_host_ptr )
{
if (( *tid_ptr == hwmon_host_ptr->monitor_ctrl.timer.tid ) )
{
mtcTimer_stop_int_safe ( hwmon_host_ptr->monitor_ctrl.timer );
hwmon_host_ptr->monitor_ctrl.timer.ring = true ;
return ;
}
else if (( *tid_ptr == hwmon_host_ptr->bmc_thread_ctrl.timer.tid ) )
{
mtcTimer_stop_int_safe ( hwmon_host_ptr->bmc_thread_ctrl.timer );
hwmon_host_ptr->bmc_thread_ctrl.timer.ring = true ;
return ;
}
else if (( *tid_ptr == hwmon_host_ptr->ping_info.timer.tid ) )
{
mtcTimer_stop_int_safe ( hwmon_host_ptr->ping_info.timer );
hwmon_host_ptr->ping_info.timer.ring = true ;
return ;
}
else if (( *tid_ptr == hwmon_host_ptr->hostTimer.tid ) )
{
mtcTimer_stop_int_safe ( hwmon_host_ptr->hostTimer );
hwmon_host_ptr->hostTimer.ring = true ;
return ;
}
else if (( *tid_ptr == hwmon_host_ptr->addTimer.tid ) )
{
mtcTimer_stop_int_safe ( hwmon_host_ptr->addTimer );
hwmon_host_ptr->addTimer.ring = true ;
return ;
}
else if (( *tid_ptr == hwmon_host_ptr->relearnTimer.tid ) )
{
mtcTimer_stop_int_safe ( hwmon_host_ptr->relearnTimer );
hwmon_host_ptr->relearnTimer.ring = true ;
hwmon_host_ptr->relearn = false ;
return ;
}
else if (( *tid_ptr == hwmon_host_ptr->secretTimer.tid ) )
{
mtcTimer_stop_int_safe ( hwmon_host_ptr->secretTimer );
hwmon_host_ptr->secretTimer.ring = true ;
return ;
}
}
}
mtcTimer_stop_tid_int_safe (tid_ptr);
}
#ifdef WANT_SENSOR_TOGGLE
bool toggle = false ;
#endif
void hwmon_service ( hwmon_ctrl_type * ctrl_ptr )
{
    /* Purpose: the Hardware Monitor main service loop ; never returns.
     *
     * ctrl_ptr - process control struct ; supplies this host's name and
     *            addresses, the audit period and the http event used for
     *            token management.
     *
     * Setup : copies identity into hostInv, starts the optional token
     *         refresh timer (the daemon exits if that fails) and the
     *         audit timer.
     * Loop  : waits on the command socket with a short select timeout,
     *         services the inbox, handles the audit timer ring, manages
     *         the authentication token, runs the host FSM and then the
     *         daemon signal and FIT handlers.
     *
     * The select 'nfds' is derived on every pass from the command
     * socket's current FD. With no usable FD the select call is given an
     * empty set and acts purely as the loop delay. */
    struct timeval waitd;
    fd_set readfds;

    daemon_config_type * config_ptr = daemon_get_cfg_ptr();
    hwmon_socket_type  * sock_ptr   = getSock_ptr();

    hostInv.hostBase.my_hostname = ctrl_ptr->my_hostname ;
    hostInv.hostBase.my_local_ip = ctrl_ptr->my_local_ip ;
    hostInv.hostBase.my_float_ip = ctrl_ptr->my_float_ip ;

    if ( config_ptr->token_refresh_rate )
    {
        /* log short refresh rates in seconds, longer ones in minutes */
        if ( config_ptr->token_refresh_rate < 300 )
        {
            ilog ("Starting 'Token' Refresh timer (%d seconds)\n",
                      (config_ptr->token_refresh_rate) );
        }
        else
        {
            ilog ("Starting 'Token' Refresh timer (%d minutes)\n",
                      (config_ptr->token_refresh_rate/60) );
        }

        if ( mtcTimer_start ( hwmonTimer_token,
                              hwmonTimer_handler,
                              config_ptr->token_refresh_rate ) != PASS )
        {
            elog ("Failed to start 'Token' Refresh Timer\n");
            daemon_exit ( ) ;
        }
    }

    if ( !sock_ptr->cmd_sock )
    {
        elog ("cannot service Null cmd_sock\n");
    }

    ilog ("Starting 'Audit' timer (%d secs)\n", ctrl_ptr->audit_period );
    mtcTimer_start ( hwmonTimer_audit, hwmonTimer_handler, ctrl_ptr->audit_period );

    for ( ; ; )
    {
        /* FD of the command socket for this pass ; 0 means 'none' */
        int cmd_fd = 0 ;

        /* Initialize the master fd_set */
        FD_ZERO(&readfds);

        /* add the command receiver socket to the FD set mask */
        if ( sock_ptr->cmd_sock )
        {
            cmd_fd = sock_ptr->cmd_sock->getFD() ;
            if ( cmd_fd > 0 )
            {
                FD_SET(cmd_fd, &readfds);
            }
            else
            {
                /* force a re-init if we have no FD */
                cmd_fd = 0 ;
                sock_ptr->cmd_sock->sock_ok(false);
            }
        }

        waitd.tv_sec  = 0;
        waitd.tv_usec = (SOCKET_WAIT*3) ;

        /* This is used as a delay up to select_timeout */
        int rc = select( cmd_fd+1, &readfds, NULL, NULL, &waitd);

        /* If the select time out expired then */
        if ( rc <= 0 )
        {
            /* Check to see if the select call failed. */
            /* ... but filter Interrupt signal         */
            if (( rc < 0 ) && ( errno != EINTR ))
            {
                elog ( "Select Failed (rc:%d) %s \n", errno, strerror(errno));
            }
        }
        else if (( cmd_fd > 0 ) && FD_ISSET(cmd_fd, &readfds))
        {
            rc = hwmon_service_inbox ();
            if ( rc > RETRY )
            {
                elog ("Failure servicing inbox (rc:%d)\n", rc);
            }
        }
        else
        {
            wlog ("unexpected select (%d)\n", rc );
        }

        if ( hwmonTimer_audit.ring == true )
        {
            mtcTimer_dump_data ();
            hostInv.set_degrade_audit();
            hwmonTimer_audit.ring = false ;

#ifdef WANT_FIT_TESTING
            if ( daemon_want_fit ( FIT_CODE__HWMON__AVOID_TOKEN_REFRESH ))
            {
                if ( hwmonTimer_token.ring == true )
                    hwmonTimer_token.ring = false ;
            }
#endif
        }

        /* Handle refreshing the authentication token */
        tokenUtil_log_refresh ();
        tokenUtil_manage_token ( ctrl_ptr->httpEvent,
                                 ctrl_ptr->my_hostname,
                                 config_ptr->token_refresh_rate,
                                 hwmonTimer_token,
                                 hwmonTimer_handler );

        /* Run the FSM */
        hostInv.hwmon_fsm ( ) ;

        daemon_signal_hdlr ();
        daemon_load_fit ( );
    }
}
/* Add Host Handler
 * ---------------------------
 * Runs one pass of the 'add' FSM for the specified host, driven by
 * host_ptr->addStage and paced by host_ptr->addTimer.
 *
 *   WAIT   - retry hold-off ; moves to START once the add timer expires
 *   START  - loads the sensor model ; STATES on success, WAIT on failure
 *   STATES - once the add timer expires, manages the startup states of a
 *            loaded model and then moves to DONE ; a failure raises the
 *            sensor config alarm once and retries after a random delay
 *   DONE   - logs completion
 *
 * An unknown stage is logged and forced to DONE.
 *
 * Always returns PASS. */
int hwmonHostClass::add_host_handler ( struct hwmonHostClass::hwmon_host * host_ptr )
{
    switch ( host_ptr->addStage )
    {
        case HWMON_ADD__WAIT:
        {
            if ( !mtcTimer_expired ( host_ptr->addTimer ))
                break ;

            host_ptr->addTimer.ring = false ;
            addStageChange ( host_ptr , HWMON_ADD__START );
            break ;
        }
        case HWMON_ADD__START:
        {
            /* force load of sensors from database if sensors = 0 and they exist */
            const int load_rc = hwmonHostClass::bmc_load_sensor_model ( host_ptr ) ;
            if ( load_rc != PASS )
            {
                /* there might be issue accessing the sysinv database */
                int retry_secs = (rand()%30)+1 ;
                wlog ("%s bmc_load_sensor_model failed (rc:%d) ; retrying in %d secs\n", host_ptr->hostname.c_str(), load_rc , retry_secs);
                mtcTimer_start ( host_ptr->addTimer, hwmonTimer_handler, retry_secs );
                addStageChange ( host_ptr , HWMON_ADD__WAIT );
            }
            else
            {
                mtcTimer_start ( host_ptr->addTimer, hwmonTimer_handler, 1);
                addStageChange (host_ptr, HWMON_ADD__STATES);
            }
            break ;
        }
        case HWMON_ADD__STATES:
        {
            if ( !mtcTimer_expired ( host_ptr->addTimer ))
                break ;

            if ( !host_ptr->sensors )
            {
                ilog ("%s no sensor model ; must be learned ; using %s\n",
                          host_ptr->hostname.c_str(),
                          bmcUtil_getProtocol_str(host_ptr->protocol).c_str());
            }
            else
            {
                /* manage the alarm and degrade states of all the sensors over process
                 * startup when the sensor model is already found in the database ;
                 * typical case over process restart. */
                const int states_rc = manage_startup_states ( host_ptr ) ;
                if ( states_rc != PASS )
                {
                    int retry_secs = (rand()%30)+1 ;

                    /* only raise the config alarm once */
                    if ( host_ptr->alarmed_config == false )
                    {
                        host_ptr->alarmed_config = true ;
                        hwmonAlarm_minor ( host_ptr->hostname, HWMON_ALARM_ID__SENSORCFG, "profile", REASON_DEGRADED );
                    }
                    wlog ("%s manage_startup_states failed (rc:%d) ; retrying in %d secs\n", host_ptr->hostname.c_str(), states_rc, retry_secs );
                    mtcTimer_start ( host_ptr->addTimer, hwmonTimer_handler, retry_secs );

                    /* stay in this stage for the retry */
                    break ;
                }

                /* run the audit right away just to update the host degrade state
                 * if it needs it ; like over a SWACT */
                degrade_state_audit ( host_ptr ) ;
                ilog ("%s add complete (groups:%d sensors:%d)\n", host_ptr->hostname.c_str(), host_ptr->groups, host_ptr->sensors );
            }
            addStageChange ( host_ptr , HWMON_ADD__DONE );
            break ;
        }
        case HWMON_ADD__DONE:
        {
            ilog ("%s add complete ; %d sensors %d groups\n", host_ptr->hostname.c_str(), host_ptr->sensors, host_ptr->groups );
            break ;
        }
        default:
        {
            slog ("%s invalid 'add' stage\n", host_ptr->hostname.c_str() );
            if ( host_ptr->addTimer.tid )
            {
                mtcTimer_stop ( host_ptr->addTimer );
            }
            mtcTimer_start ( host_ptr->addTimer, hwmonTimer_handler, (rand()%10)+1);
            addStageChange ( host_ptr , HWMON_ADD__DONE );
            break ;
        }
    }
    return (PASS);
}
/* Inventory Object wrapper - forwards the timer signal to the
 * timer_handler of the Hardware Monitor Inventory object */
void hwmonTimer_handler ( int sig, siginfo_t *si, void *uc)
{
    get_hwmonHostClass_ptr()->timer_handler ( sig, si, uc );
}
/*****************************************************************************
 *
 * Name       : interval_change_handler
 *
 * Purpose    : Handles setting the monitoring audit interval.
 *
 * Description: Nothing is changed and RETRY is returned if a request is
 *              already active for this host (host_ptr->event.base set).
 *
 *              Otherwise the following conditions are handled.
 *
 *              If host_ptr->interval is below HWMON_MIN_AUDIT_INTERVAL
 *
 *                 with existing groups host_ptr->interval is set to the
 *                 shortest group interval.
 *
 *                 with no groups host_ptr->interval is set to the
 *                 5 second first 'learning' audit interval.
 *
 *              If in relearn mode and the preserved model interval differs
 *              then host_ptr->interval is restored to the preserved value.
 *
 *              Every group whose interval differs from host_ptr->interval
 *              is then set to that value and updated in the database.
 *              The first failed update stops the walk ; that group keeps
 *              its previous interval so that it is retried on the next
 *              call.
 *
 * Returns    : PASS  - interval handled ; host_ptr->interval_changed cleared
 *              RETRY - request already active ; try again later
 *              other - the failing hwmonHttp_mod_group return code
 *
 *****************************************************************************/
int hwmonHostClass::interval_change_handler ( struct hwmonHostClass::hwmon_host * host_ptr )
{
    int rc = RETRY ;

    dlog ("%s interval change handler\n", host_ptr->hostname.c_str());

    /* Don't issue a request if there is one active already */
    if ( host_ptr->event.base == NULL )
    {
        rc = PASS ;
        if ( host_ptr->interval < HWMON_MIN_AUDIT_INTERVAL )
        {
            ilog ("%s setting audit interval\n", host_ptr->hostname.c_str());
            if ( host_ptr->groups )
            {
                int smallest = HWMON_DEFAULT_LARGE_INTERVAL ;

                /* get the smallest interval */
                for ( int g = 0 ; g < host_ptr->groups ; ++g )
                {
                    if ( smallest > host_ptr->group[g].group_interval )
                    {
                        smallest = host_ptr->group[g].group_interval ;
                    }
                }

                /* Should be no bigger than the smallest group interval setting. */
                host_ptr->interval = smallest ;
            }
            else
            {
                /* default first 'learning' audit interval */
                host_ptr->interval = 5 ;
            }
        }

        if (( host_ptr->relearn == true ) &&
            ( host_ptr->model_attributes_preserved.interval != host_ptr->interval ))
        {
            host_ptr->interval = host_ptr->model_attributes_preserved.interval ;
            ilog ("%s audit interval restored to %d seconds\n",
                      host_ptr->hostname.c_str(),
                      host_ptr->interval);
        }

        string interval_string = itos(host_ptr->interval) ;
        for ( int g = 0 ; g < host_ptr->groups ; ++g )
        {
            daemon_signal_hdlr();

            /* only update the group if they differ */
            if ( host_ptr->group[g].group_interval == host_ptr->interval )
            {
                continue ;
            }

            /* update the group interval. Even though bmc
             * monitoring does not need it, we need to be
             * backwards compatible.
             *
             * bmc monitors all groups at the same interval */
            int old = host_ptr->group[g].group_interval ;
            host_ptr->group[g].group_interval = host_ptr->interval ;
            rc = hwmonHttp_mod_group ( host_ptr->hostname,
                                       host_ptr->event,
                                       host_ptr->group[g].group_uuid,
                                       "audit_interval_group",
                                       interval_string );
            if ( rc )
            {
                /* Put the previous interval back ; otherwise the next
                 * call would see this group as already up to date and
                 * the promised retry would never be issued. */
                host_ptr->group[g].group_interval = old ;

                elog ("%s failed to update '%s' group audit interval (%d of %d); will retry later\n",
                          host_ptr->hostname.c_str(),
                          host_ptr->group[g].group_name.c_str(),
                          g, host_ptr->groups );
                break ;
            }

            char str [100] ;
            snprintf ( &str[0], sizeof(str), "audit interval changed from %d to %d seconds",
                       old,
                       host_ptr->group[g].group_interval);
            hwmonLog ( host_ptr->hostname,
                       HWMON_ALARM_ID__SENSORGROUP,
                       FM_ALARM_SEVERITY_CLEAR,
                       host_ptr->group[g].group_name, str );
        }

        /* retry until pass - retries are spaced by audit interval */
        if ( rc == PASS )
        {
            /* TODO: remove error detection and correction */
            if ( host_ptr->interval == 0 )
            {
                slog ("%s failed to set interval correctly\n",host_ptr->hostname.c_str());
                host_ptr->interval = HWMON_DEFAULT_AUDIT_INTERVAL ;
            }
            host_ptr->interval_changed = false ;
        }
    }

    ilog ("%s sensor monitoring period is %d seconds\n",
              host_ptr->hostname.c_str(),
              host_ptr->interval );

    return (rc);
}
/* Hardware Monitor Handler
* --------------------------
*
* TODO: Need grouping to enable the groups in the database
* group_ptr->group_state = "enabled" ;
* hwmonHttp_mod_group ( host_ptr->hostname, host_ptr->event , group_ptr->group_uuid, "state" , group_ptr->group_state );
* if ( group_ptr->group_state.compare("enabled") )
* TODO: Need grouping disabled on state transition from monitoring enabled to disabled
*
*
* */
int hwmonHostClass::bmc_sensor_monitor ( struct hwmonHostClass::hwmon_host * host_ptr )
{
int rc = RETRY ;
if ( host_ptr )
{
/* Check the stage */
if ( host_ptr->monitor_ctrl.stage < HWMON_SENSOR_MONITOR__STAGES )
{
flog ("%s sensor monitor stage (%s)\n",
host_ptr->hostname.c_str(),
monitorStages_str[host_ptr->monitor_ctrl.stage].c_str());
}
else
{
slog ("%s bad sensor monitor state (%d) - forcing into IDLE\n",
host_ptr->hostname.c_str(),
host_ptr->monitor_ctrl.stage);
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__START );
}
/* check for a new model relearn request */
if ( host_ptr->relearn_request == true )
{
int relearn_time = MTC_MINS_1 ;
/* gracefully handle delete model failure retry.
* if there is a relearn timer running then wait for it
* to expire. This way previously failed relear request
* retries are throttled. */
if ( mtcTimer_expired ( host_ptr->relearnTimer ) == false )
{
/* TODO: test FIT */
return (RETRY);
}
ilog ("%s handling sensor model relearn request\n",
host_ptr->hostname.c_str());
rc = bmc_delete_sensor_model ( host_ptr );
if ( rc != PASS )
{
elog ("%s delete model failure ; retry in %d seconds\n",
host_ptr->hostname.c_str(), relearn_time );
/* If we got an error then wait relearn_time
* before trying again */
mtcTimer_start ( host_ptr->relearnTimer,
hwmonTimer_handler,
relearn_time );
return (RETRY);
}
relearn_time = MTC_MINS_2;
/* enter relearn mode */
host_ptr->relearn = true ;
/* exit relearn request mode.
* allow the relearn operation to proceed */
host_ptr->relearn_request = false ;
this->monitor_soon ( host_ptr );
/* start the relearn timer */
mtcTimer_start ( host_ptr->relearnTimer,
hwmonTimer_handler,
relearn_time );
}
switch ( host_ptr->monitor_ctrl.stage )
{
/******************************************************************
*
* The IDLE stage is the default start and do nothing stage while
* monitoring is disabled.
*
* Stage Transition: external
*
******************************************************************/
case HWMON_SENSOR_MONITOR__IDLE:
{
break ;
}
/******************************************************************
*
* A delayed START
*
*****************************************************************/
case HWMON_SENSOR_MONITOR__RESTART:
{
if ( mtcTimer_expired ( host_ptr->monitor_ctrl.timer ) )
{
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__START );
}
break ;
}
/******************************************************************
*
* The START stage is the default stage and starts sensor
* monitoring if enabled for this host.
*
* The start process begins with adding a small randomized delay
* before the first READ so that over a process (re)start we don't
* jolt the process by trying to read sensors from all hosts at the
* same time.
*
* Stage Transition:
*
* Success path -> HWMON_SENSOR_MONITOR__DELAY
* Failure Path -> HWMON_SENSOR_MONITOR__IDLE
*
******************************************************************/
case HWMON_SENSOR_MONITOR__START:
{
mtcTimer_reset ( host_ptr->monitor_ctrl.timer );
if ( host_ptr->monitor )
{
/* Handle Audit Interval Change */
if ( host_ptr->interval_changed )
{
interval_change_handler ( host_ptr );
}
/* Handle power state query
* - don't depend on poweron if in relearn mode.
* - otherwise we need to ensure the model is learned
* while the host power is on.
* See comments in HWMON_SENSOR_MONITOR__POWER for details */
if (( host_ptr->sensors == 0 ) &&
( host_ptr->poweron == false ) &&
( host_ptr->relearn == false ))
{
if ( host_ptr->bmc_thread_ctrl.id )
{
wlog ("%s sensor monitor thread is unexpectedly active ; retry soon\n", host_ptr->hostname.c_str());
thread_kill ( host_ptr->bmc_thread_ctrl, host_ptr->bmc_thread_info );
sleep (1);
break ;
}
host_ptr->accounting_bad_count = 0 ;
host_ptr->bmc_thread_ctrl.id = 0 ;
host_ptr->bmc_thread_ctrl.done = false ;
host_ptr->bmc_thread_info.data.clear() ;
host_ptr->bmc_thread_info.status_string.clear();
host_ptr->bmc_thread_info.status = -1 ;
host_ptr->bmc_thread_info.progress = 0 ;
host_ptr->bmc_thread_info.id = 0 ;
host_ptr->bmc_thread_info.signal = 0 ;
host_ptr->bmc_thread_info.command = BMC_THREAD_CMD__POWER_STATUS ;
host_ptr->bmc_thread_info.proto = host_ptr->protocol ;
/* Update / Setup the BMC query credentials */
host_ptr->thread_extra_info.bm_ip = host_ptr->bm_ip ;
host_ptr->thread_extra_info.bm_un = host_ptr->bm_un ;
host_ptr->thread_extra_info.bm_pw = host_ptr->bm_pw ;
rc = thread_launch ( host_ptr->bmc_thread_ctrl,
host_ptr->bmc_thread_info ) ;
if ( rc != PASS )
{
host_ptr->bmc_thread_info.status = rc ;
host_ptr->bmc_thread_info.status_string =
"failed to launch power query thread" ;
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__FAIL );
}
else
{
/* Assign the extra data pointer */
host_ptr->bmc_thread_info.extra_info_ptr = (void*)&host_ptr->thread_extra_info ;
/* start an umbrella timer 5 seconds longer than
* the default thread FSM timout */
mtcTimer_start ( host_ptr->monitor_ctrl.timer,
hwmonTimer_handler,
(DEFAULT_THREAD_TIMEOUT_SECS+5) );
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__POWER );
}
break ;
}
else if ( host_ptr->interval )
{
/* Assign the extra data pointer */
host_ptr->bmc_thread_info.extra_info_ptr = (void*)&host_ptr->thread_extra_info ;
/* randomize the first audit a little so that over a swact we don't spike hwmond */
int r = (rand() % MTC_MINS_1) + 1 ;
/* poll all the sensors right away - between 1 and 10 seconds */
ilog ("%s sensor monitoring begins in %d seconds\n",
host_ptr->hostname.c_str(), r );
mtcTimer_start ( host_ptr->monitor_ctrl.timer, hwmonTimer_handler, r );
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__DELAY );
break ;
}
else
{
host_ptr->interval_changed = true ;
blog ("%s audit interval is zero ; auto correcting\n", host_ptr->hostname.c_str());
break ;
}
}
else
{
ilog ("%s sensor monitoring disabled\n", host_ptr->hostname.c_str());
}
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__IDLE );
break ;
}
/******************************************************************
*
* The POWER stage handles a power query response.
*
* The START is re-invoked if the power query fails or
* shows that the power is off.
*
* Stage Transition:
*
* Success path -> HWMON_SENSOR_MONITOR__DELAY
* Failure Path -> HWMON_SENSOR_MONITOR__START
*
******************************************************************/
case HWMON_SENSOR_MONITOR__POWER:
{
/* handle thread execution umbrella timeout */
if ( mtcTimer_expired ( host_ptr->monitor_ctrl.timer ) )
{
host_ptr->monitor_ctrl.timer.ring = false ;
wlog ("%s power query thread timeout\n",
host_ptr->hostname.c_str());
thread_kill ( host_ptr->bmc_thread_ctrl, host_ptr->bmc_thread_info );
mtcTimer_start ( host_ptr->monitor_ctrl.timer,
hwmonTimer_handler, MTC_MINS_1 );
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__RESTART );
}
/* check for 'thread done' completion */
else if ( thread_done( host_ptr->bmc_thread_ctrl ) )
{
/* Consume done results */
mtcTimer_reset ( host_ptr->monitor_ctrl.timer );
if ( host_ptr->bmc_thread_info.status )
{
elog ("%s %s thread %2d failed (rc:%d) (%d:%d)\n",
host_ptr->bmc_thread_ctrl.hostname.c_str(),
host_ptr->bmc_thread_ctrl.name.c_str(),
host_ptr->bmc_thread_info.command,
host_ptr->bmc_thread_info.status,
host_ptr->bmc_thread_info.progress,
host_ptr->bmc_thread_info.runcount);
wlog ("%s ... %s\n",
host_ptr->bmc_thread_ctrl.hostname.c_str(),
host_ptr->bmc_thread_info.status_string.c_str());
}
else
{
dlog ("%s '%s' thread '%d' command is done ; (%d:%d) (rc:%d)\n",
host_ptr->bmc_thread_ctrl.hostname.c_str(),
host_ptr->bmc_thread_ctrl.name.c_str(),
host_ptr->bmc_thread_info.command,
host_ptr->bmc_thread_info.progress,
host_ptr->bmc_thread_info.runcount,
host_ptr->bmc_thread_info.status);
blog2("%s ... status: %s\n",
host_ptr->bmc_thread_ctrl.hostname.c_str(),
host_ptr->bmc_thread_info.status_string.c_str());
#ifdef WANT_FIT_TESTING
if ( daemon_want_fit ( FIT_CODE__HWMON__NO_DATA, host_ptr->hostname ))
{
host_ptr->bmc_thread_info.data.clear ();
host_ptr->bmc_thread_info.status = 0 ;
host_ptr->bmc_thread_info.status_string.clear ();
slog ("%s FIT No Power Status Data\n", host_ptr->hostname.c_str());
}
#endif
if ( host_ptr->bmc_thread_info.data.empty())
{
wlog ("%s power query status empty ; retrying query\n",
host_ptr->hostname.c_str());
}
else if ( host_ptr->bmc_thread_info.data.find (BMC_POWER_ON_STATUS) == string::npos )
{
host_ptr->poweron = false ;
wlog ("%s sensor model learning delayed ; need power on",
host_ptr->hostname.c_str());
}
else
{
/* OK, this is what we have been waiting for */
ilog ("%s power is on", host_ptr->hostname.c_str());
host_ptr->poweron = true ;
}
}
host_ptr->bmc_thread_ctrl.done = true ;
/* Start monitoring in 10 seconds */
int delay = MTC_SECS_10 ;
/* If power is off, retry in 2 minutes ; hold-off period */
if ( host_ptr->poweron == false )
delay = MTC_MINS_2 ;
mtcTimer_start ( host_ptr->monitor_ctrl.timer,
hwmonTimer_handler, delay );
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__RESTART );
}
break ;
}
/******************************************************************
*
* The DELAY stage inserts time after a failure recovery or
* between successive sensor READ intervals.
*
* The failure path is invoked if the 'thread' stage is not IDLE
* when the DELAY period expires.
*
* Stage Transition:
*
* Success path -> HWMON_SENSOR_MONITOR__READ
* Failure Path -> HWMON_SENSOR_MONITOR__FAIL
*
******************************************************************/
case HWMON_SENSOR_MONITOR__DELAY:
{
if ( mtcTimer_expired ( host_ptr->monitor_ctrl.timer ) )
{
host_ptr->monitor_ctrl.timer.ring = false ;
/* if there was a previous connection failure being handled
* then give it time to resolve */
if ( !thread_idle ( host_ptr->bmc_thread_ctrl ) )
{
wlog ("%s rejecting thread run stage change ; FSM not IDLE (thread stage:%s)\n",
host_ptr->hostname.c_str(),
thread_stage(host_ptr->bmc_thread_ctrl).c_str());
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__FAIL );
}
else
{
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__READ );
}
}
/* Handle Audit Interval Change ...
* While we are waiting for the next audit check to see if we have received
* an monitor interval change. If we have then update the database with the
* new data, force this interval to finish and on the next audit the new
* interval will be loaded */
else if ( host_ptr->interval_changed )
{
interval_change_handler ( host_ptr );
/* force this audit interval to expire but don't include this in the
* pass case only. Give sysinv it some time before the next retry */
mtcTimer_stop ( host_ptr->monitor_ctrl.timer );
host_ptr->monitor_ctrl.timer.ring = true ;
}
break ;
}
/******************************************************************
*
* The READ stage requests the launch of the hwmonThread_bmc
* thread that will read the sensor data from the specified host.
*
* An umbrella timeout timer is started on behalf of the PARSE
* stage to detect threadUtil FSM not completing.
*
* Launch will fail if attempted if the thread is already running
* or if the launch request returns a failure.
*
* Stage Transition:
*
* Success path -> HWMON_SENSOR_MONITOR__PARSE
* Failure Path -> HWMON_SENSOR_MONITOR__FAIL
*
******************************************************************/
case HWMON_SENSOR_MONITOR__READ:
{
if ( host_ptr->bmc_thread_ctrl.id )
{
host_ptr->bmc_thread_info.status = FAIL_THREAD_RUNNING ;
host_ptr->bmc_thread_info.status_string =
"sensor monitor thread is unexpectedly active ; handling as failure" ;
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__FAIL );
break ;
}
host_ptr->accounting_bad_count = 0 ;
host_ptr->bmc_thread_ctrl.id = 0 ;
host_ptr->bmc_thread_ctrl.done = false ;
host_ptr->bmc_thread_info.data.clear() ;
host_ptr->bmc_thread_info.status_string.clear();
host_ptr->bmc_thread_info.status = -1 ;
host_ptr->bmc_thread_info.progress = 0 ;
host_ptr->bmc_thread_info.id = 0 ;
host_ptr->bmc_thread_info.signal = 0 ;
host_ptr->bmc_thread_info.command = BMC_THREAD_CMD__READ_SENSORS ;
host_ptr->bmc_thread_info.proto = host_ptr->protocol ;
/* Update / Setup the BMC query credentials */
host_ptr->thread_extra_info.bm_ip = host_ptr->bm_ip ;
host_ptr->thread_extra_info.bm_un = host_ptr->bm_un ;
host_ptr->thread_extra_info.bm_pw = host_ptr->bm_pw ;
rc = thread_launch ( host_ptr->bmc_thread_ctrl, host_ptr->bmc_thread_info ) ;
if ( rc != PASS )
{
host_ptr->bmc_thread_info.status = rc ;
host_ptr->bmc_thread_info.status_string =
"failed to launch sensor monitoring thread" ;
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__FAIL );
}
else
{
/* start an umbrella timer 5 seconds longer than
* the default thread FSM timout */
mtcTimer_start ( host_ptr->monitor_ctrl.timer,
hwmonTimer_handler,
(DEFAULT_THREAD_TIMEOUT_SECS+5) );
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__PARSE );
}
break ;
}
/******************************************************************
* The PARSE stage has 2 main functions
*
* 1. Wait for the bmc command completion from the READ stage
* while monitoring for and handling the unbrella timeout case.
*
* 2. PARSE the sensor data json string into the sample list
*
* sample[MAX_HOST_SENSORS]
*
* The number of sensors read by thread is specified in
*
* thread_extra_info.samples
*
* Failure case is invoked for
* - thread completion umbrella timeout.
* - thread completion error
* - sensor data parse error
*
* Stage Transition:
*
* Success path -> HWMON_SENSOR_MONITOR__CHECK
* Failure Path -> HWMON_SENSOR_MONITOR__FAIL
*
******************************************************************/
case HWMON_SENSOR_MONITOR__PARSE:
{
daemon_signal_hdlr ();
/* Unbrella timeout timer check */
if ( mtcTimer_expired ( host_ptr->monitor_ctrl.timer ) )
{
host_ptr->monitor_ctrl.timer.ring = false ;
host_ptr->bmc_thread_info.status = FAIL_TIMEOUT ;
host_ptr->bmc_thread_info.status_string =
"timeout waiting for sensor read data" ;
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__FAIL );
}
/* check for 'thread done' completion */
else if ( thread_done( host_ptr->bmc_thread_ctrl ) )
{
/* Consume done results */
mtcTimer_stop ( host_ptr->monitor_ctrl.timer );
if ( host_ptr->bmc_thread_info.status )
{
host_ptr->bmc_thread_ctrl.retries++ ;
if (!(host_ptr->bmc_thread_ctrl.retries>MAX_THREAD_RETRIES))
{
if ( host_ptr->bmc_thread_info.status == FAIL_SYSTEM_CALL )
{
elog ("%s '%s' system call failed (retry %d of %d)",
host_ptr->bmc_thread_info.log_prefix,
bmcUtil_getCmd_str(
host_ptr->bmc_thread_info.command).c_str(),
host_ptr->bmc_thread_ctrl.retries,
MAX_THREAD_RETRIES);
}
else if (( host_ptr->bmc_thread_ctrl.retries == (MAX_THREAD_RETRIES-1)) ||
( host_ptr->bmc_thread_ctrl.retries == 1 ))
{
elog ("%s '%s' failed (rc:%d) (retry %d of %d) (%d:%d)\n",
host_ptr->bmc_thread_info.log_prefix,
bmcUtil_getCmd_str(
host_ptr->bmc_thread_info.command).c_str(),
host_ptr->bmc_thread_info.status,
host_ptr->bmc_thread_ctrl.retries,
MAX_THREAD_RETRIES,
host_ptr->bmc_thread_info.progress,
host_ptr->bmc_thread_info.runcount);
}
/* don't flood the logs with the same error data over and over */
if ( host_ptr->bmc_thread_ctrl.retries == 1 )
{
blog ("%s ... %s\n",
host_ptr->bmc_thread_ctrl.hostname.c_str(),
host_ptr->bmc_thread_info.status_string.c_str());
}
host_ptr->bmc_thread_ctrl.done = true ;
mtcTimer_start ( host_ptr->monitor_ctrl.timer, hwmonTimer_handler, THREAD_RETRY_DELAY_SECS );
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__DELAY );
break ;
}
}
else
{
dlog ("%s '%s' thread '%d' command is done ; (%d:%d) (rc:%d)\n",
host_ptr->bmc_thread_ctrl.hostname.c_str(),
host_ptr->bmc_thread_ctrl.name.c_str(),
host_ptr->bmc_thread_info.command,
host_ptr->bmc_thread_info.progress,
host_ptr->bmc_thread_info.runcount,
host_ptr->bmc_thread_info.status);
blog2 ("%s ... data: %s\n",
host_ptr->bmc_thread_ctrl.hostname.c_str(),
host_ptr->bmc_thread_info.status_string.c_str());
}
host_ptr->bmc_thread_ctrl.done = true ;
host_ptr->bmc_thread_ctrl.retries = 0 ;
#ifdef WANT_FIT_TESTING
if ( daemon_want_fit ( FIT_CODE__HWMON__NO_DATA, host_ptr->hostname ))
{
host_ptr->bmc_thread_info.data.clear ();
host_ptr->bmc_thread_info.status = 0 ;
host_ptr->bmc_thread_info.status_string.clear ();
}
#endif
if ( host_ptr->bmc_thread_info.status == PASS )
{
/* NOTE: This parsing method is not leaking memory ; verified ! */
json_bool status ;
struct json_object * raw_obj = json_tokener_parse( host_ptr->bmc_thread_info.data.data() );
if ( raw_obj )
{
/* Look for ... BMC_JSON__SENSOR_DATA_MESSAGE_HEADER */
struct json_object * req_obj = (struct json_object *)(NULL) ;
status = json_object_object_get_ex ( raw_obj, BMC_JSON__SENSOR_DATA_MESSAGE_HEADER, &req_obj );
if (( status == true ) && req_obj )
{
char * msg_ptr = (char*)json_object_to_json_string(req_obj) ;
host_ptr->json_bmc_sensors = msg_ptr ;
if ( msg_ptr )
{
host_ptr->bmc_thread_info.status = bmc_load_sensor_samples ( host_ptr , msg_ptr);
if ( host_ptr->bmc_thread_info.status == PASS )
{
if ( host_ptr->samples != host_ptr->sensors )
{
if ( host_ptr->quanta_server == false )
{
blog ("%s read %d sensor samples but expected %d\n",
host_ptr->hostname.c_str(),
host_ptr->samples,
host_ptr->sensors );
}
}
_stage_change ( host_ptr->hostname, host_ptr->monitor_ctrl.stage, HWMON_SENSOR_MONITOR__CHECK );
}
else
{
host_ptr->bmc_thread_info.status_string = "failed to load sensor data" ;
}
}
else
{
host_ptr->bmc_thread_info.status_string = "failed to get json message after header" ;
host_ptr->bmc_thread_info.status = FAIL_JSON_PARSE ;
}
}
else
{
host_ptr->bmc_thread_info.status_string = "failed to find '" ;
host_ptr->bmc_thread_info.status_string.append(BMC_JSON__SENSOR_DATA_MESSAGE_HEADER);
host_ptr->bmc_thread_info.status_string.append("' label") ;
host_ptr->bmc_thread_info.status = FAIL_JSON_PARSE ;
}
}
else
{
host_ptr->bmc_thread_info.status_string = "failed to parse bmc sensor data string" ;
host_ptr->bmc_thread_info.status = FAIL_JSON_PARSE ;
}
if (raw_obj) json_object_put(raw_obj);
}
if ( host_ptr->bmc_thread_info.status )
{
/* Handle thread error status */
if ( host_ptr->groups == 0 )
{
if ( host_ptr->alarmed_config == false )
{
host_ptr->alarmed_config = true ;
hwmonAlarm_minor ( host_ptr->hostname, HWMON_ALARM_ID__SENSORCFG, "profile", REASON_DEGRADED );
}
}
else
{
bmc_set_group_state ( host_ptr, "failed" );
}
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__FAIL );
}
} /* end handling of done command */
break ;
}
/******************************************************************
*
* The CHECK stage is run on the last parsed sample data loaded
* into the temporary sample sensor data list ...
*
* host_ptr->sample[MAX_HOST_SENSORS]
*
* The number of samples loaded into the sample is
* specified in
*
* host_ptr->samples
*
* The CHECK is intended to identify sensor data corruption or
* model changes that might occur over a BMC firmware upgrade.
*
* The CHECK involves performing a checksum of all the sensor
* names in each list and comparing that checksum to the last
* time the sensors were read.
*
* A stored checksum of zero indicates the first sample read.
* If at that time host_ptr->sensors == 0 then a call to
* bmc_create_sensor_model is made to create a new sensor
* model based on these last sample readings.
*
* If the stored checksums do not match the current checksums
* then that constitutes a sensor mismatch with a design log.
* The mismatch counter is incremented. If the mismatch
* counter exceeds its threshold then the current sensor model
* is deleted and re-created using the new data.
*
* A customer log is created whenever a host's sensor model
* is created or re-created.
*
* Stage Transition:
*
* Success path -> HWMON_SENSOR_MONITOR__UPDATE
* Failure Path -> HWMON_SENSOR_MONITOR__FAIL
*
*********************************************************************/
case HWMON_SENSOR_MONITOR__CHECK:
{
unsigned short temp_checksum ;
daemon_signal_hdlr ();
/* Handle cases where we got an incomplete sensor reading */
if ( host_ptr->thread_extra_info.samples == 0 )
{
if ( host_ptr->bmc_thread_info.status == PASS )
{
host_ptr->bmc_thread_info.status = FAIL_INVALID_DATA ;
host_ptr->bmc_thread_info.status_string = "incomplete sensor data reading" ;
}
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__FAIL );
break ;
}
/* get the checksum for this sample set */
temp_checksum =
checksum_sample_profile ( host_ptr->hostname,
host_ptr->thread_extra_info.samples,
&host_ptr->sample[0]);
blog1 ("%s samples profile checksum : %04x:%04x (%d:%d:%d)\n",
host_ptr->hostname.c_str(),
temp_checksum,
host_ptr->sample_sensor_checksum,
host_ptr->samples,
host_ptr->sensors,
host_ptr->thread_extra_info.samples);
/* Initialize the sample checksums and counts for the first reading case */
if ( host_ptr->sample_sensor_checksum == 0 )
{
// host_ptr->samples = host_ptr->thread_extra_info.samples ;
host_ptr->sample_sensor_checksum = temp_checksum ;
}
/* look for first sensor reading case with an empty database profile.
* This can occur over a fresh provisioning or a model recreation */
if ( host_ptr->sensors == 0 )
{
ilog ("%s samples profile checksum : %04x (%d sensors) (%d samples)\n",
host_ptr->hostname.c_str(),
host_ptr->sample_sensor_checksum,
host_ptr->sensors,
host_ptr->samples);
/* check the sample model against known Quanta Server profile checksums and sensor numbers */
if (((( host_ptr->sample_sensor_checksum == QUANTA_SAMPLE_PROFILE_CHECKSUM_VER_13_53 ) || ( host_ptr->sample_sensor_checksum == QUANTA_SAMPLE_PROFILE_CHECKSUM_VER_13_50 )) &&
(( host_ptr->samples == QUANTA_SAMPLE_PROFILE_SENSORS_VER_13_53) || (QUANTA_SAMPLE_PROFILE_CHECKSUM_VER_13_50 ))) ||
(( host_ptr->sample_sensor_checksum == QUANTA_SAMPLE_PROFILE_CHECKSUM_VER_13___ )) ||
(( host_ptr->sample_sensor_checksum == QUANTA_SAMPLE_PROFILE_CHECKSUM_VER_13_53b )) ||
(( host_ptr->sample_sensor_checksum == QUANTA_SAMPLE_PROFILE_CHECKSUM_VER_13_47 ) && ( host_ptr->samples == QUANTA_SAMPLE_PROFILE_SENSORS_VER_13_47 )) ||
(( host_ptr->sample_sensor_checksum == QUANTA_SAMPLE_PROFILE_CHECKSUM_VER_13_42 ) && ( host_ptr->samples == QUANTA_SAMPLE_PROFILE_SENSORS_VER_13_42 )) ||
(( host_ptr->sample_sensor_checksum == QUANTA_SAMPLE_PROFILE_CHECKSUM_VER__3_29 ) && ( host_ptr->samples == QUANTA_SAMPLE_PROFILE_SENSORS_VER__3_29 )))
{
/* TODO: can also add search for missing sensors */
ilog ("%s -----------------------------------------------\n", host_ptr->hostname.c_str());
ilog ("%s is a Quanta server based on sensor sample data\n", host_ptr->hostname.c_str());
ilog ("%s -----------------------------------------------\n", host_ptr->hostname.c_str());
host_ptr->quanta_server = true ;
}
/* Create a sensor model from 'this' sample data */
if ( bmc_create_sensor_model ( host_ptr ) != PASS )
{
elog ("%s failed to create sensor model (in sysinv)\n",
host_ptr->hostname.c_str());
}
}
if ( host_ptr->profile_sensor_checksum == 0 )
{
host_ptr->profile_sensor_checksum =
checksum_sensor_profile ( host_ptr->hostname,
host_ptr->sensors,
&host_ptr->sensor[0]);
}
if (( host_ptr->sensors == 0 ) || ( host_ptr->groups == 0 ))
{
elog ("%s has read %d sensors but cannot process with no sensor model (%d:%d)\n",
host_ptr->hostname.c_str(),
host_ptr->thread_extra_info.samples,
host_ptr->sensors,
host_ptr->groups);
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__START );
}
else
{
blog ("%s has read %d sensors ... processing results\n",
host_ptr->hostname.c_str(), host_ptr->samples);
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__UPDATE );
}
break ;
}
/******************************************************************
*
* The UPDATE stage translates the string based sensor sample
* data's 'status' to a severity and adds that to the sensors'
* sample_severity member in the sensor list.
*
* host_ptr->sensor[MAX_SENSORS].sample_severity
*
* Stage Transition:
*
* Success path -> HWMON_SENSOR_MONITOR__HANDLE
* Failure Path -> HWMON_SENSOR_MONITOR__FAIL
*
*****************************************************************/
case HWMON_SENSOR_MONITOR__UPDATE:
{
if ( host_ptr->sensor_query_count++ == START_DEBOUCE_COUNT )
{
/* onetime log showing debounce mode started */
ilog ("%s sensor status deboucing enabled\n", host_ptr->hostname.c_str());
}
daemon_signal_hdlr ();
/* handle clearing the config alarm if its raised but we are
* now at a point where the sensors are readable */
if ( host_ptr->alarmed_config == true )
{
host_ptr->alarmed_config = false ;
hwmonAlarm_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSORCFG, "profile", REASON_OK );
}
if ( bmc_update_sensors ( host_ptr ) == PASS )
{
if ( ( rc = bmc_set_group_state ( host_ptr, "enabled" ) ) == PASS )
{
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__HANDLE );
}
else
{
elog ("%s failed to set group state to 'enabled' (in sysinv) (rc:%d)\n",
host_ptr->hostname.c_str(), rc);
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__FAIL );
}
}
else
{
elog ("%s failed to update sensor data (in hwmon) (rc:%d)\n",
host_ptr->hostname.c_str(), rc);
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__FAIL );
}
break ;
}
case HWMON_SENSOR_MONITOR__HANDLE:
{
/**************************************************************
*
* Loop over all the sensors handling their current severity.
*
* At this point the new severities are in
* sensor_ptr->sample_severity.
*
* After a sensor is serviced in this loop that
* sensor_ptr->sample_severity is copied to ptr->severity
* to be compared against on the next audit interval.
*
*************************************************************/
for ( int i = 0 ; i < host_ptr->sensors ; i++ )
{
/*
* This variable controls whether status change actions
* need to be taken at the end of this loop for sensor
* in context. Assume sensor status is not changed.
*/
bool mod_status = false ;
/* lets use a local pointer to make the code easier to read */
sensor_type * ptr = &host_ptr->sensor[i] ;
/* Local copy of new severity */
sensor_severity_enum severity = ptr->sample_severity ;
/* Things can get a little busy so lets make sure we
* service the signal handler and incoming http requests
* from sysinv.
*/
daemon_signal_hdlr ();
hwmonHttp_server_look ();
/* Internasl error checking ; never seen but just in case.
* Skip over and swerr about null sensor name */
if ( ptr->sensorname.empty() )
{
slog ("%s %d sensor name is empty\n", host_ptr->hostname.c_str(), i );
continue ;
}
if ( ptr->updated == false )
{
host_ptr->accounting_bad_count++ ;
/*
* Force a sensor MINOR if we fail to get status from
* it NOT_FOUND_COUNT_BEFORE_MINOR or more times in a row
*
* This debounces the one of sensor update misses but the
* log above at least shows if/when this is happening.
*/
if ( ++ptr->not_updated_status_change_count >= NOT_FOUND_COUNT_BEFORE_MINOR )
{
severity = HWMON_SEVERITY_MINOR ;
}
}
else
{
ptr->not_updated_status_change_count = 0 ;
}
if ( severity != ptr->severity)
{
ilog ("%s %s status change ; %s:%s -> %s\n",
host_ptr->hostname.c_str(),
ptr->sensorname.c_str(),
get_severity(ptr->severity).c_str(),
ptr->status.c_str(),
get_severity(severity).c_str());
}
blog1 ("%s %s curr:%s this:%s last:%s\n",
host_ptr->hostname.c_str(),
ptr->sensorname.c_str(),
ptr->status.c_str(),
ptr->sample_status.c_str(),
ptr->sample_status_last.c_str());
if ( severity == HWMON_SEVERITY_GOOD )
{
if ( ptr->status.compare("ok") )
{
/* don't bother printing a log for sensors that
* go from offline to ok */
if ( ptr->status != "offline" )
{
ilog ("%s %s is ok (was %s)\n",
host_ptr->hostname.c_str(),
ptr->sensorname.c_str(),
ptr->status.c_str());
}
/* last state was not 'ok' */
mod_status = true ;
ptr->status = "ok" ;
clear_ignored_state (ptr );
clear_logged_state (ptr );
}
/* TODO: verify clearing sensor that has cleared over a process restart */
if ((( ptr->suppress == false ) && ( ptr->severity != HWMON_SEVERITY_GOOD )) ||
((ptr->alarmed == true ) || ( ptr->degraded == true )))
{
hwmonHostClass::manage_sensor_state ( host_ptr->hostname, ptr , HWMON_SEVERITY_GOOD );
}
}
else
{
/* Handle transition from offline to online
* - clear any alarm that exhists for a sensor
* coming out of the offline state is no longer
* offline.
**/
if (( severity != HWMON_SEVERITY_OFFLINE ) && ( !ptr->status.compare("offline") ))
{
wlog ("%s %s sensor returned from '%s' with '%s' severity [alarmed:%s]\n",
host_ptr->hostname.c_str(),
ptr->sensorname.c_str(),
ptr->status.c_str(),
get_severity(severity).c_str(),
ptr->alarmed ? "Yes" : "No");
/* Clear the alarm and allow it to be re-raised if the issue exists */
clear_asserted_alarm ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, ptr, REASON_ONLINE );
}
if ( severity == HWMON_SEVERITY_OFFLINE )
{
if ( ptr->status.compare("offline"))
{
if ( ptr->alarmed == true )
{
hwmonAlarm_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, ptr->sensorname, REASON_OFFLINE );
ptr->alarmed = false ;
}
ptr->degraded = false ;
if ( ptr->critl.logged || ptr->major.logged || ptr->minor.logged )
{
hwmonLog_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, ptr->sensorname, REASON_OFFLINE );
ptr->critl.logged = ptr->major.logged = ptr->minor.logged = false ;
}
mod_status = true ;
blog ("%s %s sensor status change '%s' -> 'offline'\n",
host_ptr->hostname.c_str(),
ptr->status.c_str(),
ptr->sensorname.c_str());
ptr->status = "offline" ;
}
}
else if ( severity == HWMON_SEVERITY_MINOR )
{
/* logs and alarms state changes are handled when the ignore
* action is set in the modify handler so there is no need
* to call the manager in the ignore case */
if (( ptr->suppress == false ) && ( ptr->actions_minor.compare (HWMON_ACTION_IGNORE)))
{
hwmonHostClass::manage_sensor_state ( host_ptr->hostname, ptr, HWMON_SEVERITY_MINOR );
}
else
{
if ( ptr->alarmed == true )
{
/* We may have transitioned to ignore from an alarm state so check and clear if an alarm exists */
clear_asserted_alarm ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, ptr, REASON_IGNORED );
}
clear_logged_state ( ptr ) ;
}
/* still maintain the status
* ... if not minor then set it to minor */
if ( ptr->status.compare("minor") )
{
ptr->status = "minor" ;
mod_status = true ;
}
}
else if ( severity == HWMON_SEVERITY_MAJOR )
{
/* logs and alarms state changes are handled when the ignore
* action is set in the modify handler so there is no need
* to call the manager in the ignore case */
if (( ptr->suppress == false ) && ( ptr->actions_major.compare (HWMON_ACTION_IGNORE)))
{
hwmonHostClass::manage_sensor_state ( host_ptr->hostname, ptr, HWMON_SEVERITY_MAJOR );
}
else
{
if ( ptr->alarmed == true )
{
/* We may have transitioned to ignore from an alarm state so check and clear if an alarm exists */
clear_asserted_alarm ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, ptr, REASON_IGNORED );
}
clear_logged_state ( ptr ) ;
}
/* if not major then set it to major */
if ( ptr->status.compare("major") )
{
ptr->status = "major" ;
mod_status = true ;
}
}
else if (( severity == HWMON_SEVERITY_CRITICAL ) ||
( severity == HWMON_SEVERITY_NONRECOVERABLE ))
{
/* log and alarm state changes are handled when the ignore
* action is set in the modify handler so there is no need
* to call the manager in the ignore case */
if (( ptr->suppress == false ) && ( ptr->actions_critl.compare (HWMON_ACTION_IGNORE)))
{
if ( !ptr->actions_critl.compare (HWMON_ACTION_RESET))
{
if ( host_ptr->monitor == false )
{
/* Ignore event while we are not monitoring */
ilog ("%s %s ignoring 'reset action' while not monitoring\n",
host_ptr->hostname.c_str(),
ptr->sensorname.c_str());
}
else
{
if ( ptr->critl.alarmed == false )
{
hwmonAlarm_critical ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR,
ptr->sensorname, REASON_RESETTING ) ;
}
clear_alarmed_state ( ptr );
set_alarmed_severity ( ptr, FM_ALARM_SEVERITY_CRITICAL );
if ( ptr->degraded == false )
{
ptr->degraded = true ;
}
clear_ignored_state ( ptr );
clear_logged_state ( ptr );
/* Send reset request to mtcAgent */
wlog ("%s requesting 'reset' due to critical '%s' sensor\n",
host_ptr->hostname.c_str(),
ptr->sensorname.c_str());
hwmon_send_event ( host_ptr->hostname,
MTC_EVENT_HWMON_RESET,
ptr->sensorname.data());
}
}
else if ( !ptr->actions_critl.compare (HWMON_ACTION_POWERCYCLE))
{
if ( host_ptr->monitor == false )
{
/* Ignore event while we are not monitoring */
ilog ("%s %s ignoring 'power-cycle action' while not monitoring\n",
host_ptr->hostname.c_str(),
ptr->sensorname.c_str());
}
else
{
if ( ptr->critl.alarmed == false )
{
hwmonAlarm_critical ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR,
ptr->sensorname, REASON_POWERCYCLING ) ;
}
clear_alarmed_state ( ptr );
set_alarmed_severity ( ptr, FM_ALARM_SEVERITY_CRITICAL );
if ( ptr->degraded == false )
{
ptr->degraded = true ;
}
clear_ignored_state ( ptr );
clear_logged_state ( ptr );
wlog ("%s requesting 'powercycle' due to critical '%s' sensor\n",
host_ptr->hostname.c_str(),
ptr->sensorname.c_str());
/* Send reset request to mtcAgent */
hwmon_send_event ( host_ptr->hostname,
MTC_EVENT_HWMON_POWERCYCLE,
ptr->sensorname.data());
}
}
else
{
hwmonHostClass::manage_sensor_state ( host_ptr->hostname, ptr, HWMON_SEVERITY_CRITICAL );
}
}
else
{
if ( ptr->alarmed == true )
{
/* We may have transitioned to ignore from an alarm state so check and clear if an alarm exists */
clear_asserted_alarm ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, ptr, REASON_IGNORED );
}
else
{
blog2 ("%s %s is not alarmed\n", host_ptr->hostname.c_str(), ptr->sensorname.c_str() );
}
clear_logged_state ( ptr ) ;
}
/* if not critical then set it to critical */
if ( ptr->status.compare("critical") )
{
ptr->status = "critical" ;
mod_status = true ;
}
}
else
{
slog ("%s unknown severity (%d)\n", host_ptr->hostname.c_str(), severity );
}
} /* end else that look at non-good severities */
if ( mod_status == true )
{
hwmonHttp_mod_sensor ( host_ptr->hostname, host_ptr->event , ptr->uuid, "status" , ptr->status );
}
ptr->severity = severity ;
} /* end for loop over all sensors */
if ( host_ptr->bmc_fw_version.empty() )
{
string fn = (IPMITOOL_OUTPUT_DIR + host_ptr->hostname + "_mc_info") ;
if ( daemon_is_file_present ( fn.data() ) )
{
host_ptr->bmc_fw_version =
get_bmc_version_string ( host_ptr->hostname,
fn.data() );
}
if ( !host_ptr->bmc_fw_version.empty() )
{
ilog ("%s bmc fw version: %s\n",
host_ptr->hostname.c_str(),
host_ptr->bmc_fw_version.c_str());
}
}
/* Start the next group interval timer */
if ( host_ptr->interval < HWMON_MIN_AUDIT_INTERVAL )
{
ilog ("%s monitor interval set to a %d secs cadence (%d)\n",
host_ptr->hostname.c_str(),
HWMON_DEFAULT_AUDIT_INTERVAL,
host_ptr->interval);
host_ptr->interval = HWMON_DEFAULT_AUDIT_INTERVAL ;
interval_change_handler ( host_ptr );
}
/* exit sensor model relearn mode if we have sensors and groups */
if (( host_ptr->relearn == true ) &&
( host_ptr->sensors ) && ( host_ptr->groups ))
{
mtcTimer_reset ( host_ptr->relearnTimer );
host_ptr->relearn = false ;
plog ("%s sensor model relearn complete\n",
host_ptr->hostname.c_str());
}
mtcTimer_start ( host_ptr->monitor_ctrl.timer,
hwmonTimer_handler,
host_ptr->interval );
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__DELAY );
break ;
}
case HWMON_SENSOR_MONITOR__FAIL:
{
host_ptr->ping_info.ok = false ;
host_ptr->bmc_thread_ctrl.retries = 0 ;
mtcTimer_reset ( host_ptr->monitor_ctrl.timer );
if ( host_ptr->bmc_thread_info.status )
{
elog ("%s sensor monitoring failure (rc:%d)\n",
host_ptr->hostname.c_str(),
host_ptr->bmc_thread_info.status );
if ( host_ptr->bmc_thread_info.data.length() )
{
string _temp = host_ptr->bmc_thread_info.status_string ;
size_t pos = _temp.find ("-f", 0) ;
if ( pos != std::string::npos )
{
/* don't log the password filename */
elog ("%s ... %s\n",
host_ptr->hostname.c_str(),
_temp.substr(0,pos).c_str());
}
else
{
elog ("%s ... %s\n",
host_ptr->hostname.c_str(),
host_ptr->bmc_thread_info.status_string.c_str());
}
}
}
if ( host_ptr->bmc_thread_ctrl.id )
{
slog ("%s sensor monitor thread is unexpectedly active ; handling as failure\n",
host_ptr->hostname.c_str());
thread_kill ( host_ptr->bmc_thread_ctrl, host_ptr->bmc_thread_info );
}
if ( host_ptr->interval )
{
bmc_set_group_state ( host_ptr, "failed" ) ;
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__START );
}
else
{
/* TODO: Error case that should not happen ; need to force reprovision */
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__IDLE );
}
break ;
}
case HWMON_SENSOR_MONITOR__STAGES:
default:
{
slog ("%s Invalid stage (%d)\n",
host_ptr->hostname.c_str(),
host_ptr->monitor_ctrl.stage );
_stage_change ( host_ptr->hostname,
host_ptr->monitor_ctrl.stage,
HWMON_SENSOR_MONITOR__START );
}
}
}
return (rc);
}
/*****************************************************************************
 *
 * Name       : delete_handler
 *
 * Description: Run one pass of the host delete FSM ; the stage that runs is
 *              selected by host_ptr->delStage.
 *
 *   HWMON_DEL__START - zero the retry counter. If the bmc thread stage is
 *                      not idle then kill the thread, start the host timer
 *                      for THREAD_POST_KILL_WAIT seconds and go to WAIT ;
 *                      otherwise go straight to DONE.
 *
 *   HWMON_DEL__WAIT  - nothing happens until the host timer expires.
 *                      If the thread is still not idle, another kill is
 *                      sent and the timer re-armed, up to 3 times ; after
 *                      that the handler gives up and goes to DONE anyway.
 *
 *   HWMON_DEL__DONE  - delete the host and flag host_deleted.
 *
 *   An unexpected stage value is logged and corrected to START.
 *
 * Returns    : FAIL_NULL_POINTER for a null host pointer, otherwise PASS.
 *
 *****************************************************************************/
int hwmonHostClass::delete_handler ( struct hwmonHostClass::hwmon_host * host_ptr )
{
    /* how many extra kill requests the WAIT stage sends before giving up */
    const int max_kill_retries = 3 ;

    if ( host_ptr == NULL )
    {
        slog ("delete handler called with null pointer\n");
        return (FAIL_NULL_POINTER);
    }

    switch ( host_ptr->delStage )
    {
        case HWMON_DEL__START:
        {
            ilog ("%s Delete Operation Started\n", host_ptr->hostname.c_str());
            host_ptr->retries = 0 ;

            if ( host_ptr->bmc_thread_ctrl.stage != THREAD_STAGE__IDLE )
            {
                int delay = THREAD_POST_KILL_WAIT ;

                thread_kill ( host_ptr->bmc_thread_ctrl , host_ptr->bmc_thread_info) ;

                ilog ("%s thread active ; sending kill ; waiting %d seconds\n",
                          host_ptr->hostname.c_str(), delay );

                mtcTimer_reset ( host_ptr->hostTimer );
                mtcTimer_start ( host_ptr->hostTimer, hwmonTimer_handler, delay );
                host_ptr->delStage = HWMON_DEL__WAIT ;
            }
            else
            {
                host_ptr->delStage = HWMON_DEL__DONE ;
            }
            break ;
        }
        case HWMON_DEL__WAIT:
        {
            if ( mtcTimer_expired ( host_ptr->hostTimer ) )
            {
                if ( host_ptr->bmc_thread_ctrl.stage != THREAD_STAGE__IDLE )
                {
                    /* post-increment ; the log below reports the new count */
                    if ( host_ptr->retries++ < max_kill_retries )
                    {
                        wlog ("%s still waiting on active thread ; sending another kill signal (try %d of %d)\n",
                                  host_ptr->hostname.c_str(), host_ptr->retries, max_kill_retries );

                        thread_kill ( host_ptr->bmc_thread_ctrl, host_ptr->bmc_thread_info ) ;
                        mtcTimer_start ( host_ptr->hostTimer, hwmonTimer_handler, THREAD_POST_KILL_WAIT );

                        /* stay in the WAIT stage for another timer period */
                        break ;
                    }
                    else
                    {
                        elog ("%s thread refuses to stop ; giving up ...\n",
                                  host_ptr->hostname.c_str());
                    }
                }
                host_ptr->delStage = HWMON_DEL__DONE ;
            }
            break ;
        }
        case HWMON_DEL__DONE:
        {
            /* ok now delete the host
             * NOTE(review): host_ptr presumably no longer refers to a valid
             * host once del_host returns ; it is not touched again here */
            del_host ( host_ptr->hostname );
            this->host_deleted = true ;
            break ;
        }
        default:
        {
            ilog ("%s invalid delete stage (%d) ; correcting ...\n", host_ptr->hostname.c_str(), host_ptr->delStage );
            host_ptr->delStage = HWMON_DEL__START ;
            break ;
        }
    }
    return (PASS);
}
/*****************************************************************************
 *
 * Name       : manage_startup_states
 *
 * Description: Manage the sensor startup states.
 *
 *              This means failure log, alarm and degraded states on
 *              startup for groups and sensors. The alarms returned by the
 *              alarm query are reconciled against the group state and
 *              sensor status held in the host's inventory.
 *
 *   Profile  - clear the retired 'sensor' config alarm and, when
 *              alarmed_config is not set, the 'profile' config alarm.
 *
 *   Groups   - clear alarms whose instance matches no current group.
 *              Then for each group ; a 'failed' group gets a major alarm
 *              if it has none or has one at another severity, any other
 *              group has a found alarm cleared.
 *
 *   Sensors  - clear alarms whose instance matches no current sensor.
 *              Then for each sensor ; based on its status and the action
 *              configured for that status, call the alarmed / degraded /
 *              logged / ignored state helpers and raise, correct or clear
 *              the sensor alarm to match.
 *
 * Returns    : FAIL_NULL_POINTER for a null host pointer,
 *              FAIL_OPERATION if either alarm query fails,
 *              otherwise the status of the last alarm query (PASS).
 *
 * NOTE(review): the declared return type is bool yet integer status codes
 *               are returned. Assuming PASS is zero, 'false' means success
 *               and 'true' means failure - confirm against the callers
 *               before changing either side.
 *
 *****************************************************************************/
bool hwmonHostClass::manage_startup_states ( struct hwmonHostClass::hwmon_host * host_ptr )
{
    if ( host_ptr == NULL )
    {
        return (FAIL_NULL_POINTER);
    }

    int rc = PASS ;
    std::list<hwmonAlarm_entity_status_type>::iterator _iter_ptr ;
    std::list<hwmonAlarm_entity_status_type> alarm_list ;

    /********************** Manage Profile Alarms ***********************/

    /* clear this config alarm as it is not used anymore - handles patchback case.
     * Its cheaper to send a clear than it is to query for it first */
    hwmonAlarm_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSORCFG, "sensor", REASON_OK );

#ifdef WANT_QUERY_SENSOR_CONFIG_ALARM
    /* We don't degrade for sensor config error - this is similar to a
     * BMC access error in mtcAgent where we only raise a minor alarm */
    if ( hwmon_alarm_query ( host_ptr->hostname, HWMON_ALARM_ID__SENSORCFG, "profile" ) != FM_ALARM_SEVERITY_CLEAR )
        host_ptr->alarmed_config = true ;
#endif

    if ( host_ptr->alarmed_config == false )
    {
        hwmonAlarm_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSORCFG, "profile", REASON_OK );
    }

    /********************** Manage Group Alarms ***********************/

    string entity = "host=" + host_ptr->hostname + ".sensorgroup=" ;

    /* 1. Query for all group alarms */
    rc = hwmonAlarm_query_entity ( host_ptr->hostname, entity, alarm_list );
    if ( rc != PASS )
    {
        elog ("%s sensorgroup alarm query failed\n", host_ptr->hostname.c_str() );
        return (FAIL_OPERATION);
    }

    /* 2. Search the alarm list for orphan groups
     *    - group alarms that are not in the current group list
     *    - should not occur but is a catch all for stuck group alarms */
    for ( _iter_ptr = alarm_list.begin(); _iter_ptr != alarm_list.end(); ++_iter_ptr )
    {
        bool found = false ;
        for ( int g = 0 ; g < host_ptr->groups ; g++ )
        {
            string _temp = entity + host_ptr->group[g].group_name ;
            if ( _iter_ptr->instance.compare(_temp) == 0 )
            {
                found = true ;
                break ;
            }
        }
        if ( found == false )
        {
            /* the group name is whatever follows the entity prefix */
            string groupname = _iter_ptr->instance.substr (entity.length()) ;
            wlog ("%s found orphan group alarm '%s' ; clearing\n", host_ptr->hostname.c_str(), groupname.c_str() );
            hwmonAlarm_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSORGROUP, groupname, REASON_DEPROVISIONED );
        }
    }

    /* 3. Look up each alarmed group and then manage that alarm */
    for ( int g = 0 ; g < host_ptr->groups ; g++ )
    {
        struct sensor_group_type * group_ptr = &host_ptr->group[g] ;
        const string group_instance = entity + group_ptr->group_name ;
        bool found = false ;

        daemon_signal_hdlr ();

        for ( _iter_ptr = alarm_list.begin(); _iter_ptr != alarm_list.end(); ++_iter_ptr )
        {
            if ( _iter_ptr->instance.compare(group_instance) == 0 )
            {
                ilog ("%s '%s' group '%s' alarm already set\n",
                          host_ptr->hostname.c_str(),
                          host_ptr->group[g].group_name.c_str(),
                          alarmUtil_getSev_str(_iter_ptr->severity).c_str());
                found = true ;
                break ;
            }
        }

        /* Note: if found == true then the group_ptr points to the group that
         *       has the alarm raised and _iter_ptr points to the alarm info.
         *       _iter_ptr must only be dereferenced when found == true */

        /* Determine if this alarm needs to be raised or cleared ... or left alone
         * Database state takes precedence over all */
        if ( group_ptr->group_state.compare("failed") == 0 )
        {
            bool raise = false ;

            group_ptr->failed  = true ;
            group_ptr->alarmed = true ;

            if ( found == false )
            {
                raise = true ;
            }
            else if ( _iter_ptr->severity != FM_ALARM_SEVERITY_MAJOR )
            {
                slog ("%s %s group alarm severity incorrect (%d:%s) ; correcting \n",
                          host_ptr->hostname.c_str(),
                          _iter_ptr->entity.c_str(),
                          _iter_ptr->severity,
                          alarmUtil_getSev_str(_iter_ptr->severity).c_str());
                raise = true ;
            }

            if ( raise == true )
            {
                hwmonAlarm_major ( host_ptr->hostname, HWMON_ALARM_ID__SENSORGROUP, group_ptr->group_name, REASON_DEGRADED );
            }
        }
        else
        {
            group_ptr->failed  = false ;
            group_ptr->alarmed = false ;

            if ( found == true )
            {
                hwmonAlarm_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSORGROUP, group_ptr->group_name, REASON_OK );
            }
        }
    }

    /********************** Manage Sensor Alarms ***********************/

    /* 1. Query Sensor Alarm States from FM */
    entity = "host=" + host_ptr->hostname + ".sensor=" ;
    rc = hwmonAlarm_query_entity ( host_ptr->hostname, entity, alarm_list );
    if ( rc != PASS )
    {
        elog ("%s sensor alarm query failed\n", host_ptr->hostname.c_str() );
        return (FAIL_OPERATION);
    }

    /* 2. Search the alarm list for orphan sensors
     *    - sensor alarms that are not in the current sensor list
     *    - should not occur but is a catch all for stuck sensor alarms */
    for ( _iter_ptr = alarm_list.begin (); _iter_ptr != alarm_list.end () ; ++_iter_ptr )
    {
        bool found = false ;
        for ( int s = 0 ; s < host_ptr->sensors ; s++ )
        {
            string _temp = entity + host_ptr->sensor[s].sensorname ;
            if ( _iter_ptr->instance.compare(_temp) == 0 )
            {
                ilog ("%s '%s' sensor '%s' alarm already set\n",
                          host_ptr->hostname.c_str(),
                          host_ptr->sensor[s].sensorname.c_str(),
                          alarmUtil_getSev_str(_iter_ptr->severity).c_str());
                found = true ;
                break ;
            }
        }
        if ( found == false )
        {
            /* the sensor name is whatever follows the entity prefix */
            string sensorname = _iter_ptr->instance.substr (entity.length()) ;
            wlog ("%s found orphan sensor alarm '%s' ; clearing\n", host_ptr->hostname.c_str(), sensorname.c_str() );
            hwmonAlarm_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, sensorname, REASON_DEPROVISIONED );
        }
    }

    /* 3. manage the state of sensors alarms */
    for ( int s = 0 ; s < host_ptr->sensors ; s++ )
    {
        sensor_type * sensor_ptr = &host_ptr->sensor[s] ;
        const string sensor_instance = entity + sensor_ptr->sensorname ;
        string reason = REASON_OK ;

        bool found = false ;
        bool clear = false ;
        bool minor = false ;
        bool major = false ;
        bool critl = false ;

        daemon_signal_hdlr ();

        for ( _iter_ptr  = alarm_list.begin () ;
              _iter_ptr != alarm_list.end () ;
            ++_iter_ptr )
        {
            if ( _iter_ptr->instance.compare(sensor_instance) == 0 )
            {
                found = true ;
                break ;
            }
        }

        /* Note: if found == true then the sensor_ptr points to the sensor that
         *       has the alarm raised and _iter_ptr points to the alarm info.
         *       _iter_ptr must only be dereferenced when found == true */

        /* Determine if this alarm needs to be raised or cleared ... or left alone
         * Database state takes precedence over all */
        if (( sensor_ptr->status.compare("ok") == 0 ) ||
            ( sensor_ptr->status.compare("offline") == 0 ))
        {
            /* neither an ok nor an offline sensor carries an alarm */
            clear_alarmed_state  ( sensor_ptr );
            clear_degraded_state ( sensor_ptr );
            if ( found == true )
            {
                clear = true ;
            }
        }
        else if ( sensor_ptr->status.compare("minor") == 0 )
        {
            /* action is something other than 'alarm' */
            if ( sensor_ptr->actions_minor.compare("alarm"))
            {
                if ( found == true )
                {
                    clear = true ;
                }
                if ( sensor_ptr->actions_minor.compare("log") == 0 )
                {
                    set_logged_severity ( sensor_ptr, FM_ALARM_SEVERITY_MINOR );
                    reason = REASON_SET_TO_LOG ;
                }
                /* the minor action decides ignore handling for a minor
                 * status, in line with the major and critical cases below */
                if ( sensor_ptr->actions_minor.compare("ignore") == 0 )
                {
                    set_ignored_severity ( sensor_ptr, FM_ALARM_SEVERITY_MINOR );
                    reason = REASON_IGNORED ;
                }
            }
            else if ( sensor_ptr->suppress == true )
            {
                if ( found == true )
                {
                    reason = REASON_SUPPRESSED ;
                    clear = true ;
                }
            }
            /**
             *  else this is an alarm case ...
             *   - if no alarm found then raise the minor alarm
             *   - if alarm found but not in proper severity then
             *     raise the minor alarm
             **/
            else
            {
                set_alarmed_severity ( sensor_ptr , FM_ALARM_SEVERITY_MINOR );
                clear_degraded_state ( sensor_ptr );
                if (( found == false ) ||
                    ( _iter_ptr->severity != FM_ALARM_SEVERITY_MINOR ))
                {
                    /* correct the severity of the alarm */
                    minor = true ;
                }
            }
        }
        else if ( sensor_ptr->status.compare("major") == 0 )
        {
            /* action is something other than 'alarm' */
            if ( sensor_ptr->actions_major.compare("alarm"))
            {
                if ( found == true )
                {
                    clear = true ;
                }
                if ( sensor_ptr->actions_major.compare("log") == 0 )
                {
                    set_logged_severity ( sensor_ptr, FM_ALARM_SEVERITY_MAJOR );
                    reason = REASON_SET_TO_LOG ;
                }
                if ( sensor_ptr->actions_major.compare("ignore") == 0 )
                {
                    set_ignored_severity ( sensor_ptr, FM_ALARM_SEVERITY_MAJOR ) ;
                    reason = REASON_IGNORED ;
                }
            }
            else if ( sensor_ptr->suppress == true )
            {
                if ( found == true )
                {
                    reason = REASON_SUPPRESSED ;
                    clear = true ;
                }
            }
            /**
             *  else this is an alarm case ...
             *   - if no alarm found then raise the major alarm
             *   - if alarm found but not in proper severity then
             *     raise the major alarm
             **/
            else
            {
                set_alarmed_severity ( sensor_ptr , FM_ALARM_SEVERITY_MAJOR );
                set_degraded_state ( sensor_ptr );
                if (( found == false ) ||
                    ( _iter_ptr->severity != FM_ALARM_SEVERITY_MAJOR ))
                {
                    /* correct the severity of the alarm */
                    major = true ;
                }
            }
        }
        else if ( sensor_ptr->status.compare("critical") == 0 )
        {
            /* action is something other than 'alarm' */
            if ( sensor_ptr->actions_critl.compare("alarm"))
            {
                if ( found == true )
                {
                    clear = true ;
                }
                if ( sensor_ptr->actions_critl.compare("log") == 0 )
                {
                    set_logged_severity ( sensor_ptr, FM_ALARM_SEVERITY_CRITICAL ) ;
                    reason = REASON_SET_TO_LOG ;
                }
                if ( sensor_ptr->actions_critl.compare("ignore") == 0 )
                {
                    set_ignored_severity ( sensor_ptr , FM_ALARM_SEVERITY_CRITICAL ) ;
                    reason = REASON_IGNORED ;
                }
            }
            else if ( sensor_ptr->suppress == true )
            {
                if ( found == true )
                {
                    reason = REASON_SUPPRESSED ;
                    clear = true ;
                }
            }
            /**
             *  else this is an alarm case ...
             *   - if no alarm found then raise the critical alarm
             *   - if alarm found but not in proper severity then
             *     raise the critical alarm
             **/
            else
            {
                set_alarmed_severity ( sensor_ptr , FM_ALARM_SEVERITY_CRITICAL );
                set_degraded_state ( sensor_ptr );
                if (( found == false ) ||
                    ( _iter_ptr->severity != FM_ALARM_SEVERITY_CRITICAL ))
                {
                    /* correct the severity of the alarm */
                    critl = true ;
                }
            }
        }

        /* Apply the decision made above ; at most one action per sensor */
        if ( clear == true )
        {
            clear_alarmed_state ( sensor_ptr );
            clear_degraded_state ( sensor_ptr );
            hwmonAlarm_clear ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, reason );
        }
        else if ( minor == true )
        {
            clear_degraded_state ( sensor_ptr );
            set_alarmed_severity ( sensor_ptr, FM_ALARM_SEVERITY_MINOR );
            hwmonAlarm_minor ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, REASON_DEGRADED );
        }
        else if ( major == true )
        {
            set_degraded_state ( sensor_ptr );
            set_alarmed_severity ( sensor_ptr, FM_ALARM_SEVERITY_MAJOR );
            hwmonAlarm_major ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, REASON_DEGRADED );
        }
        else if ( critl == true )
        {
            set_degraded_state ( sensor_ptr );
            set_alarmed_severity ( sensor_ptr, FM_ALARM_SEVERITY_CRITICAL);
            hwmonAlarm_critical ( host_ptr->hostname, HWMON_ALARM_ID__SENSOR, sensor_ptr->sensorname, REASON_DEGRADED );
        }
        // sensorState_print ( host_ptr->hostname, sensor_ptr );
    }
    return (rc);
}
/*****************************************************************************
 *
 * Name       : monitor_now
 *
 * Description: Force monitor to occur immediately.
 *
 *              Only acts while the monitor FSM is in its DELAY stage ; the
 *              delay timer is reset and its ring flag is set by hand. In
 *              any other stage the call does nothing.
 *
 *              A null host pointer is logged and otherwise ignored.
 *
 ****************************************************************************/
void hwmonHostClass::monitor_now ( struct hwmonHostClass::hwmon_host * host_ptr )
{
    if ( !host_ptr )
    {
        slog ("null host pointer\n");
        return ;
    }

    /* nothing to cut short unless the FSM is sitting in its delay */
    if ( host_ptr->monitor_ctrl.stage != HWMON_SENSOR_MONITOR__DELAY )
    {
        return ;
    }

    mtcTimer_reset ( host_ptr->monitor_ctrl.timer );

    /* mark the timer as rung */
    host_ptr->monitor_ctrl.timer.ring = true ;

    dlog ("%s force monitor now\n", host_ptr->hostname.c_str() );
}
/*****************************************************************************
 *
 * Name       : monitor_soon
 *
 * Description: Abort the current monitor FSM stage and force monitor to
 *              occur after a short delay.
 *
 *              The delay is MTC_SECS_5 seconds, extended by
 *              THREAD_POST_KILL_WAIT when a bmc thread id is set and the
 *              thread therefore has to be killed first.
 *
 *              The FSM is moved to its DELAY stage and the monitor timer
 *              is restarted with that delay.
 *
 *              A null host pointer is logged and otherwise ignored.
 *
 ****************************************************************************/
void hwmonHostClass::monitor_soon ( struct hwmonHostClass::hwmon_host * host_ptr )
{
    if ( !host_ptr )
    {
        slog ("null host pointer\n");
        return ;
    }

    int resume_secs = MTC_SECS_5 ;

    wlog ("%s sensor monitoring FSM stage (%d) aborted\n",
              host_ptr->hostname.c_str(),
              host_ptr->monitor_ctrl.stage);

    if ( host_ptr->bmc_thread_ctrl.id )
    {
        ilog ("%s stopping current thread (%lu)\n", host_ptr->hostname.c_str(), host_ptr->bmc_thread_ctrl.id );
        thread_kill ( host_ptr->bmc_thread_ctrl, host_ptr->bmc_thread_info );

        /* have to wait a bit longer than THREAD_POST_KILL_WAIT for the thread kill to happen */
        resume_secs = resume_secs + THREAD_POST_KILL_WAIT ;
    }

    _stage_change ( host_ptr->hostname,
                    host_ptr->monitor_ctrl.stage,
                    HWMON_SENSOR_MONITOR__DELAY) ;

    /* restart the monitor timer with the delay worked out above */
    mtcTimer_reset ( host_ptr->monitor_ctrl.timer );
    mtcTimer_start ( host_ptr->monitor_ctrl.timer,
                     hwmonTimer_handler, resume_secs );

    ilog ("%s sensor monitoring will resume in %d seconds\n",
              host_ptr->hostname.c_str(), resume_secs );
}