Add alarm retry support to maintenance alarm handling daemon

The maintenance alarm handling daemon (mtcalarmd) should not
drop alarm requests simply because FM process is not running.
Instead it should retry for it and other FM error cases that
will likely succeed in time if they are retried.

Some error cases however do need to be dropped such as those
that are unlikely to succeed with retries.

Reviewed FM return codes with FM designer which led to a list
of errors that should drop and others that should retry.

This update implements that handling with a posting and
servicing of a first-in / first-out alarm queue.

Typical retry case is the NOCONNECT error code which occurs
when FM is not running.

Alarm ordering and first try timestamp are maintained.
Retries and logs are throttled to avoid flooding.

Test Plan:

PASS: Verify success path alarm handling End-to-End.
PASS: Verify retry handling while FM is not running.
PASS: Verify handling of all FM error codes (fit tool).
PASS: Verify alarm handling under stress (inject-alarm script) soak.
PASS: Verify no memory leak over stress soak.
PASS: Verify logging (success, retry, failure)
PASS: Verify alarm posted date is maintained over retry success.

Change-Id: Icd1e75583ef660b767e0788dd4af7f184bdb9e86
Closes-Bug: 1841653
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2019-09-27 11:04:33 -04:00
parent 901e81a191
commit f2fedc0446
8 changed files with 376 additions and 118 deletions

View File

@ -67,6 +67,7 @@
#define MTC_CMD_FIT__JSON_LEAK_SOAK ("/var/run/fit/json_leak_soak") /* mtcAgent */
#define MTC_CMD_FIT__BMC_ACC_FAIL ("/var/run/fit/bmc_access_fail")/* mtcAgent */
#define MTC_CMD_FIT__MEM_LEAK_DEBUG ("/var/run/fit/mem_leak_debug")/* mtcAgent */
#define MTC_CMD_FIT__FM_ERROR_CODE ("/var/run/fit/fm_error_code") /* mtcAgent */
/*****************************************************
* Fault Insertion Codes
@ -120,7 +121,8 @@
#define FIT_CODE__FM_SET_ALARM (40)
#define FIT_CODE__FM_GET_ALARM (41)
#define FIT_CODE__FM_QRY_ALARMS (42)
#define FIT_CODE__FM_CLR_ALARM (42)
#define FIT_CODE__FM_QRY_ALARMS (43)
#define FIT_CODE__BMC_COMMAND_SEND (60)
#define FIT_CODE__BMC_COMMAND_RECV (61)

View File

@ -371,6 +371,8 @@ typedef enum
/* 50 milliseconds */
#define SOCKET_WAIT 50000
#define SOCKET_WAIT_100MS (100000)
/* 5 milliseconds */
#define MTCAGENT_SELECT_TIMEOUT (5000)

View File

@ -23,7 +23,6 @@
#define ENTITY_PREFIX ((const char *)"host=")
#define MAX_ALARMS (10)
#define MAX_ALARM_REQ_PER_MSG (4)
#define MAX_ALARM_REQ_MSG_SIZE (500)
#define MAX_ALARM_REQ_SIZE (MAX_ALARM_REQ_PER_MSG*MAX_ALARM_REQ_MSG_SIZE)
@ -97,6 +96,17 @@ typedef struct
string clear_reason ;
} alarmUtil_type ;
/* One queued alarm/log request.
 *
 * Filled in by alarmHdlr_request_handler from a parsed request message and
 * consumed, oldest first, by alarmMgr_service_queue. */
typedef struct
{
/* alarm identifier ; parsed from MTCALARM_REQ_KEY__ALARMID */
string alarmid ;
/* host the request is against ; parsed from MTCALARM_REQ_KEY__HOSTNAME */
string hostname ;
/* requested operation, stored lower case ; the queue servicing code
 * recognises "msg", "clear" and "set" */
string operation ;
/* requested severity, stored lower case ; converted with
 * alarmUtil_getSev_enum when the entry is serviced */
string severity ;
/* entity string ; parsed from MTCALARM_REQ_KEY__ENTITY */
string entity ;
/* parsed from MTCALARM_REQ_KEY__PREFIX ; only handed to
 * alarmUtil_warning_log by the queue servicing code */
string prefix ;
/* _fm_timestamp() value taken when the request was parsed, so the
 * original request time is kept across retries */
FMTimeT timestamp ;
} queue_entry_type;
#define MAX_FAILED_B2B_RECEIVES_B4_RESTART (5)
@ -130,13 +140,9 @@ alarmUtil_type * alarmData_getAlarm_ptr ( string alarm_id_str );
/* in alarmHdlr.cpp */
int alarmHdlr_request_handler ( char * msg_ptr );
/* in alarmMgr.cpp */
int alarmMgr_manage_alarm ( string alarmid ,
string hostname,
string operation,
string severity,
string entity,
string prefix);
void alarmMgr_queue_clear ( void );
void alarmMgr_queue_alarm (queue_entry_type entry);
void alarmMgr_service_queue(void);
/* Clear all alarms against this host */
void alarmUtil_clear_all ( string hostname );
@ -154,14 +160,14 @@ int alarmUtil_query_identity ( string identity,
unsigned int alarms_max );
int alarmUtil_clear ( string hostname, string alarm_id, string entity );
int alarmUtil_critical ( string hostname, string alarm_id, string entity );
int alarmUtil_major ( string hostname, string alarm_id, string entity );
int alarmUtil_minor ( string hostname, string alarm_id, string entity );
int alarmUtil_warning ( string hostname, string alarm_id, string entity );
int alarmUtil_critical_log ( string hostname, string alarm_id, string entity );
int alarmUtil_major_log ( string hostname, string alarm_id, string entity );
int alarmUtil_minor_log ( string hostname, string alarm_id, string entity );
int alarmUtil_warning_log ( string hostname, string alarm_id, string entity, string prefix );
int alarmUtil_critical ( string hostname, string alarm_id, string entity, FMTimeT & timestamp );
int alarmUtil_major ( string hostname, string alarm_id, string entity, FMTimeT & timestamp );
int alarmUtil_minor ( string hostname, string alarm_id, string entity, FMTimeT & timestamp );
int alarmUtil_warning ( string hostname, string alarm_id, string entity, FMTimeT & timestamp );
int alarmUtil_critical_log ( string hostname, string alarm_id, string entity, FMTimeT & timestamp );
int alarmUtil_major_log ( string hostname, string alarm_id, string entity, FMTimeT & timestamp );
int alarmUtil_minor_log ( string hostname, string alarm_id, string entity, FMTimeT & timestamp );
int alarmUtil_warning_log ( string hostname, string alarm_id, string entity, string prefix, FMTimeT & timestamp );
#endif // _MODULE_PRIVATE_
#endif // __INCLUDE_ALARM_H__

View File

@ -31,6 +31,24 @@ using namespace std;
void daemon_sigchld_hdlr ( void ) { ; }
/*****************************************************************************
*
* Name : _fm_timestamp
*
* Purpose : Get a microsecond timestamp of the current time.
*
* Description: Used to record the time the alarm/log was requested
*
* Uses : FMTimeT from fmAPI.h
*
****************************************************************************/
FMTimeT _fm_timestamp ( void )
{
    struct timespec ts;

    /* Do not read 'ts' if the clock could not be sampled. Zero is returned
     * instead ; the alarmUtil_<severity> setters only copy a non-zero
     * timestamp into the alarm, so a zero here is simply not applied. */
    if ( clock_gettime ( CLOCK_REALTIME, &ts ) != 0 )
    {
        return ( 0 );
    }

    /* Widen before scaling so the seconds -> microseconds multiply is done
     * in FMTimeT rather than in time_t, which may be only 32 bits wide. */
    FMTimeT usecs = (FMTimeT)ts.tv_sec ;
    usecs *= 1000000 ;
    usecs += (FMTimeT)( ts.tv_nsec / 1000 ) ;
    return ( usecs );
}
/** Daemon timer handler */
void _timer_handler ( int sig, siginfo_t *si, void *uc)
@ -62,23 +80,19 @@ int alarmHdlr_request_handler ( char * msg_ptr )
if ( elements )
{
#define PARSE_FAILURE ((const char *)"failed to parse value for key")
string alarmid = "" ;
string hostname = "" ;
queue_entry_type entry ;
string alarm_req = "" ;
string operation = "" ;
string severity = "" ;
string entity = "" ;
string prefix = "" ;
string alarm_req = "" ;
for ( int i = 0 ; i < elements ; i++ )
{
if ( ( rc = jsonUtil_get_array_idx ( msg_ptr, MTCALARM_REQ_LABEL, i, alarm_req ) ) == PASS )
{
if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__ALARMID, alarmid )) != PASS )
if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__ALARMID, entry.alarmid )) != PASS )
{
elog ("%s '%s'\n", PARSE_FAILURE, MTCALARM_REQ_KEY__ALARMID);
}
else if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__HOSTNAME, hostname )) != PASS )
else if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__HOSTNAME, entry.hostname )) != PASS )
{
elog ("%s '%s'\n", PARSE_FAILURE, MTCALARM_REQ_KEY__HOSTNAME);
}
@ -90,23 +104,19 @@ int alarmHdlr_request_handler ( char * msg_ptr )
{
elog ("%s '%s'\n", PARSE_FAILURE, MTCALARM_REQ_KEY__SEVERITY);
}
else if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__ENTITY, entity )) != PASS )
else if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__ENTITY, entry.entity )) != PASS )
{
elog ("%s '%s'\n", PARSE_FAILURE, MTCALARM_REQ_KEY__ENTITY);
}
else if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__PREFIX, prefix)) != PASS )
else if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__PREFIX, entry.prefix)) != PASS )
{
elog ("%s '%s'\n", PARSE_FAILURE, MTCALARM_REQ_KEY__PREFIX);
}
else
{
jlog ("Alarm Message has %d requests\n", elements );
rc = alarmMgr_manage_alarm ( alarmid,
hostname,
tolowercase(operation),
tolowercase(severity),
entity,
prefix);
{ entry.timestamp = _fm_timestamp ();
entry.operation = tolowercase(operation);
entry.severity = tolowercase(severity);
alarmMgr_queue_alarm (entry);
}
if ( rc ) break ;
}

View File

@ -192,6 +192,13 @@ int daemon_init ( string iface, string nodeType_str )
void daemon_service_run ( void )
{
int rc = PASS ;
#ifdef WANT_FIT_TESTING
daemon_init_fit ();
#endif
alarmMgr_queue_clear();
if (( mtcalarm_req_sock_ptr ) && ( mtcalarm_req_sock_ptr->getFD() ))
{
std::list<int> socks ;
@ -213,7 +220,7 @@ void daemon_service_run ( void )
{
daemon_signal_hdlr ();
waitd.tv_sec = 0;
waitd.tv_usec = SOCKET_WAIT;
waitd.tv_usec = SOCKET_WAIT_100MS;
/* Initialize the master fd_set */
FD_ZERO(&readfds);
@ -269,6 +276,12 @@ void daemon_service_run ( void )
break ;
}
}
#ifdef WANT_FIT_TESTING
daemon_load_fit();
#endif
alarmMgr_service_queue();
}
}
else

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016-2017 Wind River Systems, Inc.
* Copyright (c) 2016-2017,2019 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
@ -7,7 +7,7 @@
/**
* @file
* Wind River Titanium Cloud Maintenance Alarm Manager Daemon Manager
* Starling-X Maintenance Alarm Manager Daemon Manager
*/
#include <iostream>
@ -19,88 +19,297 @@ using namespace std;
#define __MODULE_PRIVATE__
#include "alarm.h" /* module header */
#include "daemon_common.h" /* for ... gettime_monotonic_nsec */
#include "alarm.h" /* module header */
int alarmMgr_manage_alarm ( string alarmid,
string hostname,
string operation,
string severity,
string entity,
string prefix)
/* Accommodate MNFA heartbeat alarms.
* Up to 2 (Mgmnt and Cluster) for each node of up to 1000 nodes = 2000 */
#define MAX_QUEUED_ALARMS (2000)
/* the alarm queue */
static list<queue_entry_type> alarm_queue ;
/* FM retry throttle */
static unsigned long long _holdoff_timestamp = 0 ;
/*************************************************************************
 *
 * Name       : _pop_front, _pop_back
 *
 * Scope      : local
 *
 * Purpose    : Remove the entry at the head/tail of the queue.
 *
 * Description: _pop_front removes the oldest entry once it has been
 *              dealt with and clears the FM retry holdoff so the next
 *              head entry is submitted without delay.
 *
 *              _pop_back removes the newest entry when the queue has
 *              grown past its limit. The retry holdoff is left alone
 *              because it throttles retries of the head entry, which
 *              is still queued ; clearing it here would turn every
 *              overflow drop into an immediate, unthrottled FM retry.
 *
 ************************************************************************/
void _pop_front( void )
{
    if ( !alarm_queue.empty() )
    {
        alarm_queue.pop_front();
    }

    /* head entry is gone ; let the next one be tried right away */
    _holdoff_timestamp = 0 ;
}

void _pop_back( void )
{
    /* drop the most recently queued request ; holdoff state is kept */
    if ( !alarm_queue.empty() )
    {
        alarm_queue.pop_back();
    }
}
/*************************************************************************
*
* Name : alarmMgr_queue_clear
*
* Purpose : Clear the alarm queue ; called from init.
*
************************************************************************/
void alarmMgr_queue_clear ( void )
{
    /* discard every queued request ; nothing is submitted to FM */
    alarm_queue.erase ( alarm_queue.begin(), alarm_queue.end() );
}
/*************************************************************************
*
* Name : alarmMgr_queue_alarm
*
* Purpose : Add an incoming alarm request to the tail of the queue.
*
************************************************************************/
void alarmMgr_queue_alarm ( queue_entry_type entry )
{
    /* The logged size is the depth before this entry is added.
     * size() yields a size_t, so it is converted to match the %ld
     * conversion in the log format. */
    alog ("%s adding %s to alarm queue [size=%ld]\n",
              entry.hostname.c_str(),
              entry.alarmid.c_str(),
              (long)alarm_queue.size() );

    /* Newest requests go to the tail so the queue is serviced first-in /
     * first-out. No depth limit is applied here ; alarmMgr_service_queue
     * trims the tail when the queue exceeds MAX_QUEUED_ALARMS. */
    alarm_queue.push_back(entry);
}
/*************************************************************************
*
* Name : alarmMgr_service_queue
*
* Purpose : Service the alarm queue from the head.
*
* Description: Load the first/oldest element of the queue and submit it
* to FM.
*
* If it fails for a reason that is likely to resolve itself
* with a retry, then it is not popped off the head. Instead
* it is left there to be retried after the hold off period.
*
* If it fails for a reason that is NOT likely to succeed
* by retries then an error log is produced and this faulty
* entry is dropped. It is done this way to avoid a bad
* entry from stalling/blocking the queue.
*
************************************************************************/
/* 5 second holdoff time before FM retry */
#define RETRY_HOLDOFF_TIME_NSECS ((unsigned long long)(5000000000))
void alarmMgr_service_queue ( void )
{
alog1 ("Elements: %ld\n", alarm_queue.size());
if ( alarm_queue.empty() )
return ;
/* throttle access to FM if in retry mode */
if ( _holdoff_timestamp )
{
unsigned long long _now_time = gettime_monotonic_nsec ();
/* only retry every RETRY_HOLDOFF_TIME_NSECS while in holdoff */
if (( _now_time-_holdoff_timestamp ) < RETRY_HOLDOFF_TIME_NSECS)
return ;
else
_holdoff_timestamp = 0 ;
}
queue_entry_type entry = alarm_queue.front() ;
int rc = PASS ;
string action = operation ;
string action = entry.operation ;
action.append (" alarm");
EFmAlarmSeverityT sev ;
ilog ("Alarm: alarmid:%s hostname:%s operation:%s severity:%s entity:%s prefix:%s\n",
alarmid.c_str(),
hostname.c_str(),
operation.c_str(),
severity.c_str(),
entity.c_str(),
prefix.c_str());
alog ("%s %s operation:%s severity:%s entity:%s prefix:%s\n",
entry.hostname.c_str(),
entry.alarmid.c_str(),
entry.operation.c_str(),
entry.severity.c_str(),
entry.entity.c_str(),
entry.prefix.c_str());
sev = alarmUtil_getSev_enum ( severity );
if (!operation.compare("msg"))
EFmAlarmSeverityT sev = alarmUtil_getSev_enum ( entry.severity );
/* customer logs */
if ( entry.operation == "msg" )
{
if ( sev == FM_ALARM_SEVERITY_WARNING )
{
//if ( prefix.compare("none"))
alarmUtil_warning_log ( hostname, alarmid, entity, prefix );
//else
// mtcAlarm_warning_log ( hostname, id, entity );
rc = alarmUtil_warning_log ( entry.hostname, entry.alarmid, entry.entity, entry.prefix, entry.timestamp );
}
else if ( sev == FM_ALARM_SEVERITY_MINOR )
{
rc = alarmUtil_minor_log ( hostname, alarmid, entity );
rc = alarmUtil_minor_log ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp );
}
else if ( sev == FM_ALARM_SEVERITY_MAJOR)
{
rc = alarmUtil_major_log ( hostname, alarmid, entity );
rc = alarmUtil_major_log ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp );
}
else if ( sev == FM_ALARM_SEVERITY_CRITICAL )
{
rc = alarmUtil_critical_log ( hostname, alarmid, entity );
rc = alarmUtil_critical_log ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp );
}
else
{
rc = FAIL_INVALID_OPERATION ;
wlog ("Unsupported log severity '%d:%s'\n", sev, severity.c_str());
rc = FM_ERR_INVALID_REQ ;
wlog ("Unsupported log severity '%d:%s'\n", sev, entry.severity.c_str());
}
action="create log" ;
}
/* Get the state */
else if ( !operation.compare("clear"))
/* alarm clear request */
else if ( entry.operation == "clear" )
{
rc = alarmUtil_clear ( hostname, alarmid, entity );
rc = alarmUtil_clear ( entry.hostname, entry.alarmid, entry.entity );
}
else if ( !operation.compare("set") )
/* alarm set request */
else if ( entry.operation == "set" )
{
if ( sev == FM_ALARM_SEVERITY_WARNING )
rc = alarmUtil_warning ( hostname, alarmid, entity );
rc = alarmUtil_warning ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp );
else if ( sev == FM_ALARM_SEVERITY_MINOR )
rc = alarmUtil_minor ( hostname, alarmid, entity );
rc = alarmUtil_minor ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp );
else if ( sev == FM_ALARM_SEVERITY_MAJOR )
rc = alarmUtil_major ( hostname, alarmid, entity );
rc = alarmUtil_major ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp );
else if ( sev == FM_ALARM_SEVERITY_CRITICAL )
rc = alarmUtil_critical ( hostname, alarmid, entity );
rc = alarmUtil_critical ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp );
else
{
rc = FAIL_INVALID_OPERATION ;
rc = FM_ERR_INVALID_REQ ;
}
}
else
{
rc = FAIL_BAD_CASE ;
rc = FM_ERR_INVALID_PARAMETER ;
}
if ( rc )
/* Handle behavior based on return code */
if ( rc == FM_ERR_OK )
{
elog ("%s failed to %s '%s:%s'\n", hostname.c_str(), action.c_str(), alarmid.c_str(), entity.c_str() )
/* alarm call succeeded, pop off the list. */
_pop_front();
}
return (rc);
}
else if ( rc == FM_ERR_ENTITY_NOT_FOUND )
{
ilog ("%s %s '%s:%s' ; not found",
entry.hostname.c_str(),
action.c_str(),
entry.alarmid.c_str(),
entry.entity.c_str());
_pop_front();
}
/*******************************************************************
* Now these are non-success cases.
*******************************************************************/
/* Most typical failure case first - FM not running */
else if (( rc == FM_ERR_NOCONNECT ) ||
( rc == FM_ERR_REQUEST_PENDING ) ||
( rc == FM_ERR_COMMUNICATIONS ))
{
if ( _holdoff_timestamp == 0 )
_holdoff_timestamp = gettime_monotonic_nsec();
string type = "" ;
if ( rc == FM_ERR_NOCONNECT ) type = "not connected" ;
else if ( rc == FM_ERR_COMMUNICATIONS ) type = "communication error" ;
else if ( rc == FM_ERR_REQUEST_PENDING ) type = "pending request" ;
wlog ("%s %s '%s:%s' failure ; %s ; retrying [q=%ld]",
entry.hostname.c_str(),
action.c_str(),
entry.alarmid.c_str(),
entry.entity.c_str(),
type.c_str(),
alarm_queue.size());
}
/* Look for cases where we don't want to retry.
*
* These would be cases that are unlikely to resolve with retry.
*/
/* pop off if alarm already asserted */
else if ( rc == FM_ERR_ALARM_EXISTS )
{
wlog ("%s %s '%s:%s' ; already exists",
entry.hostname.c_str(),
action.c_str(),
entry.alarmid.c_str(),
entry.entity.c_str());
_pop_front();
}
/* never retry on any of these error cases */
else if (( rc == FM_ERR_INVALID_REQ ) ||
( rc == FM_ERR_INVALID_ATTRIBUTE ) ||
( rc == FM_ERR_INVALID_PARAMETER ) ||
( rc == FM_ERR_DB_OPERATION_FAILURE ) ||
( rc == FM_ERR_RESOURCE_UNAVAILABLE ))
{
wlog ("%s failed to %s '%s:%s' ; dropped ; bad request [rc=%d]",
entry.hostname.c_str(),
action.c_str(),
entry.alarmid.c_str(),
entry.entity.c_str(), rc);
_pop_front();
}
/* never retry due to resource error on assert cases */
else if (( rc == FM_ERR_NOMEM ) ||
( rc == FM_ERR_SERVER_NO_MEM ) ||
( rc == FM_ERR_NOT_ENOUGH_SPACE ))
{
wlog ("%s failed to %s '%s:%s' ; dropped ; resource error [rc=%d]",
entry.hostname.c_str(),
action.c_str(),
entry.alarmid.c_str(),
entry.entity.c_str(),rc );
_pop_front();
}
else
{
wlog ("%s failed to %s '%s:%s' ; dropped ; unexpected [rc=%d]",
entry.hostname.c_str(),
action.c_str(),
entry.alarmid.c_str(),
entry.entity.c_str(),rc );
_pop_front();
}
/* pop from back if the queue is loaded to the max */
if ( alarm_queue.size() > MAX_QUEUED_ALARMS )
{
wlog ("%s %s '%s:%s' dropped ; most recent ; queue full",
entry.hostname.c_str(),
action.c_str(),
entry.alarmid.c_str(),
entry.entity.c_str() );
_pop_back();
}
else
{
ilog ("%ld queue entries to service", alarm_queue.size());
}
}

View File

@ -217,6 +217,7 @@ int alarmUtil_query_identity ( string identity, SFmAlarmDataT * alarm_list_ptr,
*
*
********************************************************************************/
int alarmUtil ( string & hostname,
string & identity,
string & instance,
@ -280,28 +281,26 @@ int alarmUtil ( string & hostname,
alarm.service_affecting ? 'Y' : 'N',
alarm.suppression ? 'Y' : 'N' );
ilog ( "fm_set_fault: %s %s state:%d sev:%d type:%d cause:%d sa:%c supp:%c",
hostname.c_str(),
alarm.alarm_id,
alarm.alarm_state,
alarm.severity,
alarm.alarm_type,
alarm.probable_cause,
alarm.service_affecting ? 'Y' : 'N',
alarm.suppression ? 'Y' : 'N' );
rc = fm_set_fault ( &alarm , NULL );
if ( rc != FM_ERR_OK )
#ifdef WANT_FIT_TESTING
if (( daemon_is_file_present ( MTC_CMD_FIT__FM_ERROR_CODE )) &&
( daemon_want_fit ( FIT_CODE__FM_SET_ALARM, hostname )))
{
wlog ("%s fm_set_fault call failed for alarm %s (rc:%d) ; retrying\n", hostname.c_str(), alarm.alarm_id, rc);
usleep (100000); /* sleep 100 msec */
rc = fm_set_fault ( &alarm , NULL );
if ( rc != FM_ERR_OK )
{
elog ("%s failed to set alarm %s (rc:%d) ; giving up\n", hostname.c_str(), alarm.alarm_id, rc);
rc = FAIL ;
}
rc = daemon_get_file_int(MTC_CMD_FIT__FM_ERROR_CODE) ;
}
else
#endif
{
rc = fm_set_fault ( &alarm , NULL );
}
if ( rc == FM_ERR_OK )
{
ilog ( "%s %s %s alarm raised (%s)",
hostname.c_str(),
alarm.alarm_id,
alarm.entity_instance_id,
alarmUtil_getSev_str(alarm.severity).c_str());
}
/* error cases are handled/logged in the caller's ; dequeue API */
}
else
{
@ -313,19 +312,26 @@ int alarmUtil ( string & hostname,
alog ( "fm_clear_fault: %s %s:%s", hostname.c_str(), alarm.entity_instance_id, alarm.alarm_id );
ilog ("%s clearing %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id);
if ( ( rc = fm_clear_fault ( &filter )) != FM_ERR_OK )
#ifdef WANT_FIT_TESTING
if (( daemon_is_file_present ( MTC_CMD_FIT__FM_ERROR_CODE )) &&
( daemon_want_fit ( FIT_CODE__FM_CLR_ALARM, hostname )))
{
if ( rc != FM_ERR_ENTITY_NOT_FOUND )
{
elog ("%s failed to fm_clear_fault (rc:%d)\n", hostname.c_str(), rc );
rc = FAIL ;
}
else
{
rc = PASS ;
}
rc = daemon_get_file_int(MTC_CMD_FIT__FM_ERROR_CODE) ;
}
else
#endif
{
rc = fm_clear_fault ( &filter );
}
if ( rc == FM_ERR_OK )
{
ilog ("%s %s %s alarm cleared\n",
hostname.c_str(),
alarm.alarm_id,
alarm.entity_instance_id);
}
/* error cases are handled/logged in the caller's ; dequeue API */
}
return (rc);
@ -348,7 +354,7 @@ int alarmUtil_clear ( string hostname, string alarm_id , string entity )
}
/** Assert a specified hosts's alarm with a CRITICAL severity level */
int alarmUtil_critical ( string hostname, string alarm_id , string entity )
int alarmUtil_critical ( string hostname, string alarm_id , string entity, FMTimeT & timestamp )
{
alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id);
if ( alarm_ptr )
@ -358,6 +364,7 @@ int alarmUtil_critical ( string hostname, string alarm_id , string entity )
alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_CRITICAL ;
alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_SET ;
if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ;
snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->critl_reason.data());
@ -368,7 +375,7 @@ int alarmUtil_critical ( string hostname, string alarm_id , string entity )
/** Assert a specified host's alarm with a MAJOR severity level */
int alarmUtil_major ( string hostname, string alarm_id , string entity )
int alarmUtil_major ( string hostname, string alarm_id , string entity, FMTimeT & timestamp )
{
alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id);
if ( alarm_ptr )
@ -378,6 +385,7 @@ int alarmUtil_major ( string hostname, string alarm_id , string entity )
alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_MAJOR ;
alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_SET ;
if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ;
snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->major_reason.data());
@ -385,8 +393,9 @@ int alarmUtil_major ( string hostname, string alarm_id , string entity )
}
return (FAIL_NULL_POINTER);
}
/** Assert a specified host's alarm with a MINOR severity level */
int alarmUtil_minor ( string hostname, string alarm_id , string entity )
int alarmUtil_minor ( string hostname, string alarm_id , string entity, FMTimeT & timestamp )
{
alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id);
if ( alarm_ptr )
@ -396,6 +405,7 @@ int alarmUtil_minor ( string hostname, string alarm_id , string entity )
alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_MINOR ;
alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_SET ;
if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ;
snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->minor_reason.data());
@ -405,7 +415,7 @@ int alarmUtil_minor ( string hostname, string alarm_id , string entity )
}
/** Assert a specified host's alarm with a WARNING severity level */
int alarmUtil_warning ( string hostname, string alarm_id , string entity )
int alarmUtil_warning ( string hostname, string alarm_id , string entity, FMTimeT & timestamp )
{
alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id);
if ( alarm_ptr )
@ -415,6 +425,7 @@ int alarmUtil_warning ( string hostname, string alarm_id , string entity
alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_WARNING ;
alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_SET ;
if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ;
snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->minor_reason.data());
@ -424,7 +435,7 @@ int alarmUtil_warning ( string hostname, string alarm_id , string entity
}
/** Create CRITICAL log */
int alarmUtil_critical_log ( string hostname, string alarm_id , string entity )
int alarmUtil_critical_log ( string hostname, string alarm_id , string entity, FMTimeT & timestamp )
{
alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id);
if ( alarm_ptr )
@ -434,6 +445,7 @@ int alarmUtil_critical_log ( string hostname, string alarm_id , string entity )
alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_CRITICAL ;
alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ;
if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ;
snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->critl_reason.data());
@ -444,7 +456,7 @@ int alarmUtil_critical_log ( string hostname, string alarm_id , string entity )
/** Create MAJOR log */
int alarmUtil_major_log ( string hostname, string alarm_id , string entity )
int alarmUtil_major_log ( string hostname, string alarm_id , string entity, FMTimeT & timestamp )
{
alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id);
if ( alarm_ptr )
@ -454,6 +466,7 @@ int alarmUtil_major_log ( string hostname, string alarm_id , string entity )
alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_MAJOR ;
alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ;
if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ;
snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->major_reason.data());
@ -462,7 +475,7 @@ int alarmUtil_major_log ( string hostname, string alarm_id , string entity )
return (FAIL_NULL_POINTER);
}
/** Create MINOR log */
int alarmUtil_minor_log ( string hostname, string alarm_id , string entity )
int alarmUtil_minor_log ( string hostname, string alarm_id , string entity, FMTimeT & timestamp )
{
alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id);
if ( alarm_ptr )
@ -472,6 +485,7 @@ int alarmUtil_minor_log ( string hostname, string alarm_id , string entit
alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_MINOR ;
alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ;
if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ;
snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->minor_reason.data());
@ -481,7 +495,7 @@ int alarmUtil_minor_log ( string hostname, string alarm_id , string entit
}
/** Create WARNING log */
int alarmUtil_warning_log ( string hostname, string alarm_id, string entity, string prefix )
int alarmUtil_warning_log ( string hostname, string alarm_id, string entity, string prefix, FMTimeT & timestamp )
{
alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id);
if ( alarm_ptr )
@ -491,6 +505,7 @@ int alarmUtil_warning_log ( string hostname, string alarm_id, string entity, str
alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_WARNING ;
alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ;
if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ;
snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), entity.data());

View File

@ -650,6 +650,7 @@ private:
/** @} private_monitoring_services_variables */
/* List of alarms and current severity */
#define MAX_ALARMS (10)
EFmAlarmSeverityT alarms[MAX_ALARMS];
/* tracks whether the alarms for this host have been loaded already or not */