diff --git a/mtce-common/src/common/fitCodes.h b/mtce-common/src/common/fitCodes.h index 10abb498..e53ed254 100644 --- a/mtce-common/src/common/fitCodes.h +++ b/mtce-common/src/common/fitCodes.h @@ -67,6 +67,7 @@ #define MTC_CMD_FIT__JSON_LEAK_SOAK ("/var/run/fit/json_leak_soak") /* mtcAgent */ #define MTC_CMD_FIT__BMC_ACC_FAIL ("/var/run/fit/bmc_access_fail")/* mtcAgent */ #define MTC_CMD_FIT__MEM_LEAK_DEBUG ("/var/run/fit/mem_leak_debug")/* mtcAgent */ +#define MTC_CMD_FIT__FM_ERROR_CODE ("/var/run/fit/fm_error_code") /* mtcAgent */ /***************************************************** * Fault Insertion Codes @@ -120,7 +121,8 @@ #define FIT_CODE__FM_SET_ALARM (40) #define FIT_CODE__FM_GET_ALARM (41) -#define FIT_CODE__FM_QRY_ALARMS (42) +#define FIT_CODE__FM_CLR_ALARM (42) +#define FIT_CODE__FM_QRY_ALARMS (43) #define FIT_CODE__BMC_COMMAND_SEND (60) #define FIT_CODE__BMC_COMMAND_RECV (61) diff --git a/mtce-common/src/common/nodeBase.h b/mtce-common/src/common/nodeBase.h index f56161ca..da71cd08 100755 --- a/mtce-common/src/common/nodeBase.h +++ b/mtce-common/src/common/nodeBase.h @@ -371,6 +371,8 @@ typedef enum /* 50 milliseconds */ #define SOCKET_WAIT 50000 +#define SOCKET_WAIT_100MS (100000) + /* 5 milliseconds */ #define MTCAGENT_SELECT_TIMEOUT (5000) diff --git a/mtce/src/alarm/alarm.h b/mtce/src/alarm/alarm.h index 1a15830e..2ce56391 100644 --- a/mtce/src/alarm/alarm.h +++ b/mtce/src/alarm/alarm.h @@ -23,7 +23,6 @@ #define ENTITY_PREFIX ((const char *)"host=") -#define MAX_ALARMS (10) #define MAX_ALARM_REQ_PER_MSG (4) #define MAX_ALARM_REQ_MSG_SIZE (500) #define MAX_ALARM_REQ_SIZE (MAX_ALARM_REQ_PER_MSG*MAX_ALARM_REQ_MSG_SIZE) @@ -97,6 +96,17 @@ typedef struct string clear_reason ; } alarmUtil_type ; +typedef struct +{ + string alarmid ; + string hostname ; + string operation ; + string severity ; + string entity ; + string prefix ; + FMTimeT timestamp ; + +} queue_entry_type; #define MAX_FAILED_B2B_RECEIVES_B4_RESTART (5) @@ -130,13 +140,9 @@ 
alarmUtil_type * alarmData_getAlarm_ptr ( string alarm_id_str ); /* in alarmHdlr.cpp */ int alarmHdlr_request_handler ( char * msg_ptr ); -/* in alarmMgr.cpp */ -int alarmMgr_manage_alarm ( string alarmid , - string hostname, - string operation, - string severity, - string entity, - string prefix); +void alarmMgr_queue_clear ( void ); +void alarmMgr_queue_alarm (queue_entry_type entry); +void alarmMgr_service_queue(void); /* Clear all alarms against this host */ void alarmUtil_clear_all ( string hostname ); @@ -154,14 +160,14 @@ int alarmUtil_query_identity ( string identity, unsigned int alarms_max ); int alarmUtil_clear ( string hostname, string alarm_id, string entity ); -int alarmUtil_critical ( string hostname, string alarm_id, string entity ); -int alarmUtil_major ( string hostname, string alarm_id, string entity ); -int alarmUtil_minor ( string hostname, string alarm_id, string entity ); -int alarmUtil_warning ( string hostname, string alarm_id, string entity ); -int alarmUtil_critical_log ( string hostname, string alarm_id, string entity ); -int alarmUtil_major_log ( string hostname, string alarm_id, string entity ); -int alarmUtil_minor_log ( string hostname, string alarm_id, string entity ); -int alarmUtil_warning_log ( string hostname, string alarm_id, string entity, string prefix ); +int alarmUtil_critical ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); +int alarmUtil_major ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); +int alarmUtil_minor ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); +int alarmUtil_warning ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); +int alarmUtil_critical_log ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); +int alarmUtil_major_log ( string hostname, string alarm_id, string entity, FMTimeT & timestamp ); +int alarmUtil_minor_log ( string hostname, string alarm_id, string entity, FMTimeT & timestamp 
); +int alarmUtil_warning_log ( string hostname, string alarm_id, string entity, string prefix, FMTimeT & timestamp ); #endif // _MODULE_PRIVATE_ #endif // __INCLUDE_ALARM_H__ diff --git a/mtce/src/alarm/alarmHdlr.cpp b/mtce/src/alarm/alarmHdlr.cpp index 8c2f7438..e0bb128c 100644 --- a/mtce/src/alarm/alarmHdlr.cpp +++ b/mtce/src/alarm/alarmHdlr.cpp @@ -31,6 +31,24 @@ using namespace std; void daemon_sigchld_hdlr ( void ) { ; } +/***************************************************************************** + * + * Name : _fm_timestamp + * + * Purpose : Get a microsecond timestamp of the current time. + * + * Description: Used to record the time the alarm/log was requested + * + * Uses : FMTimeT from fmAPI.h + * + ****************************************************************************/ + +FMTimeT _fm_timestamp ( void ) +{ + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + return ( ts.tv_sec*1000000 + ts.tv_nsec/1000 ); +} /** Daemon timer handler */ void _timer_handler ( int sig, siginfo_t *si, void *uc) @@ -62,23 +80,19 @@ int alarmHdlr_request_handler ( char * msg_ptr ) if ( elements ) { #define PARSE_FAILURE ((const char *)"failed to parse value for key") - string alarmid = "" ; - string hostname = "" ; + queue_entry_type entry ; + string alarm_req = "" ; string operation = "" ; string severity = "" ; - string entity = "" ; - string prefix = "" ; - string alarm_req = "" ; - for ( int i = 0 ; i < elements ; i++ ) { if ( ( rc = jsonUtil_get_array_idx ( msg_ptr, MTCALARM_REQ_LABEL, i, alarm_req ) ) == PASS ) { - if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__ALARMID, alarmid )) != PASS ) + if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__ALARMID, entry.alarmid )) != PASS ) { elog ("%s '%s'\n", PARSE_FAILURE, MTCALARM_REQ_KEY__ALARMID); } - else if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__HOSTNAME, hostname )) != PASS ) + else if (( rc = jsonUtil_get_key_val ( 
(char*)alarm_req.data(), MTCALARM_REQ_KEY__HOSTNAME, entry.hostname )) != PASS ) { elog ("%s '%s'\n", PARSE_FAILURE, MTCALARM_REQ_KEY__HOSTNAME); } @@ -90,23 +104,19 @@ int alarmHdlr_request_handler ( char * msg_ptr ) { elog ("%s '%s'\n", PARSE_FAILURE, MTCALARM_REQ_KEY__SEVERITY); } - else if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__ENTITY, entity )) != PASS ) + else if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__ENTITY, entry.entity )) != PASS ) { elog ("%s '%s'\n", PARSE_FAILURE, MTCALARM_REQ_KEY__ENTITY); } - else if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__PREFIX, prefix)) != PASS ) + else if (( rc = jsonUtil_get_key_val ( (char*)alarm_req.data(), MTCALARM_REQ_KEY__PREFIX, entry.prefix)) != PASS ) { elog ("%s '%s'\n", PARSE_FAILURE, MTCALARM_REQ_KEY__PREFIX); } else - { - jlog ("Alarm Message has %d requests\n", elements ); - rc = alarmMgr_manage_alarm ( alarmid, - hostname, - tolowercase(operation), - tolowercase(severity), - entity, - prefix); + { entry.timestamp = _fm_timestamp (); + entry.operation = tolowercase(operation); + entry.severity = tolowercase(severity); + alarmMgr_queue_alarm (entry); } if ( rc ) break ; } diff --git a/mtce/src/alarm/alarmInit.cpp b/mtce/src/alarm/alarmInit.cpp index dcf29332..7da739d6 100644 --- a/mtce/src/alarm/alarmInit.cpp +++ b/mtce/src/alarm/alarmInit.cpp @@ -192,6 +192,13 @@ int daemon_init ( string iface, string nodeType_str ) void daemon_service_run ( void ) { int rc = PASS ; + +#ifdef WANT_FIT_TESTING + daemon_init_fit (); +#endif + + alarmMgr_queue_clear(); + if (( mtcalarm_req_sock_ptr ) && ( mtcalarm_req_sock_ptr->getFD() )) { std::list socks ; @@ -213,7 +220,7 @@ void daemon_service_run ( void ) { daemon_signal_hdlr (); waitd.tv_sec = 0; - waitd.tv_usec = SOCKET_WAIT; + waitd.tv_usec = SOCKET_WAIT_100MS; /* Initialize the master fd_set */ FD_ZERO(&readfds); @@ -269,6 +276,12 @@ void daemon_service_run ( void ) break ; } 
} + +#ifdef WANT_FIT_TESTING + daemon_load_fit(); +#endif + + alarmMgr_service_queue(); } } else diff --git a/mtce/src/alarm/alarmMgr.cpp b/mtce/src/alarm/alarmMgr.cpp index 817e20b1..2d196222 100644 --- a/mtce/src/alarm/alarmMgr.cpp +++ b/mtce/src/alarm/alarmMgr.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017 Wind River Systems, Inc. + * Copyright (c) 2016-2017,2019 Wind River Systems, Inc. * * SPDX-License-Identifier: Apache-2.0 * @@ -7,7 +7,7 @@ /** * @file - * Wind River Titanium Cloud Maintenance Alarm Manager Daemon Manager + * Starling-X Maintenance Alarm Manager Daemon Manager */ #include @@ -19,88 +19,297 @@ using namespace std; #define __MODULE_PRIVATE__ -#include "alarm.h" /* module header */ +#include "daemon_common.h" /* for ... gettime_monotonic_nsec */ +#include "alarm.h" /* module header */ -int alarmMgr_manage_alarm ( string alarmid, - string hostname, - string operation, - string severity, - string entity, - string prefix) +/* Accommodate MNFA heartbeat alarms. + * Up to 2 (Mgmt and Cluster) for each node of up to 1000 nodes = 2000 */ +#define MAX_QUEUED_ALARMS (2000) + +/* the alarm queue */ +static list alarm_queue ; + +/* FM retry throttle */ +static unsigned long long _holdoff_timestamp = 0 ; + +/************************************************************************* + * + * Name : _pop_front, _pop_back + * + * Scope : local + * + * Purpose : Remove the entry at the head/tail of the queue. + * + * Also reset the FM retry holdoff timestamp. + * + ************************************************************************/ + +void _pop_front( void ) { + if ( alarm_queue.size() ) + { + alarm_queue.pop_front(); + } + _holdoff_timestamp = 0 ; +} + +void _pop_back( void ) +{ + if ( alarm_queue.size() ) + { + alarm_queue.pop_back(); + } + _holdoff_timestamp = 0 ; +} + +/************************************************************************* + * + * Name : alarmMgr_queue_clear + * + * Purpose : Clear the alarm queue ; called from init.
+ * + ************************************************************************/ +void alarmMgr_queue_clear ( void ) +{ + alarm_queue.clear(); +} + +/************************************************************************* + * + * Name : alarmMgr_queue_alarm + * + * Purpose : Add an incoming alarm request to the tail of the queue. + * + ************************************************************************/ +void alarmMgr_queue_alarm ( queue_entry_type entry ) +{ + alog ("%s adding %s to alarm queue [size=%ld]\n", + entry.hostname.c_str(), + entry.alarmid.c_str(), + alarm_queue.size() ); + + alarm_queue.push_back(entry); +} + +/************************************************************************* + * + * Name : alarmMgr_service_queue + * + * Purpose : Service the alarm queue from the head. + * + * Description: Load the first/oldest element of the queue and submit it + * to FM. + * + * If it fails for a reason that is likely to resolve itself + * with a retry, then it is not popped off the head. Instead + * it is left there to be retried after the hold off period. + * + * If it fails for a reason that is NOT likely to succeed + * by retries then an error log is produced and this faulty + * entry is dropped. It is done this way to prevent a bad + * entry from stalling/blocking the queue.
+ * + ************************************************************************/ + +/* 5 second holdoff time before FM retry */ +#define RETRY_HOLDOFF_TIME_NSECS ((unsigned long long)(5000000000)) + +void alarmMgr_service_queue ( void ) +{ + alog1 ("Elements: %ld\n", alarm_queue.size()); + if ( alarm_queue.empty() ) + return ; + + /* throttle access to FM if in retry mode */ + if ( _holdoff_timestamp ) + { + unsigned long long _now_time = gettime_monotonic_nsec (); + + /* retry only once every RETRY_HOLDOFF_TIME_NSECS while in holdoff */ + if (( _now_time-_holdoff_timestamp ) < RETRY_HOLDOFF_TIME_NSECS) + return ; + else + _holdoff_timestamp = 0 ; + } + + queue_entry_type entry = alarm_queue.front() ; + int rc = PASS ; - string action = operation ; + string action = entry.operation ; action.append (" alarm"); - EFmAlarmSeverityT sev ; - ilog ("Alarm: alarmid:%s hostname:%s operation:%s severity:%s entity:%s prefix:%s\n", - alarmid.c_str(), - hostname.c_str(), - operation.c_str(), - severity.c_str(), - entity.c_str(), - prefix.c_str()); + alog ("%s %s operation:%s severity:%s entity:%s prefix:%s\n", + entry.hostname.c_str(), + entry.alarmid.c_str(), + entry.operation.c_str(), + entry.severity.c_str(), + entry.entity.c_str(), + entry.prefix.c_str()); - sev = alarmUtil_getSev_enum ( severity ); - if (!operation.compare("msg")) + EFmAlarmSeverityT sev = alarmUtil_getSev_enum ( entry.severity ); + + /* customer logs */ + if ( entry.operation == "msg" ) { if ( sev == FM_ALARM_SEVERITY_WARNING ) { - //if ( prefix.compare("none")) - alarmUtil_warning_log ( hostname, alarmid, entity, prefix ); - //else - // mtcAlarm_warning_log ( hostname, id, entity ); + rc = alarmUtil_warning_log ( entry.hostname, entry.alarmid, entry.entity, entry.prefix, entry.timestamp ); } else if ( sev == FM_ALARM_SEVERITY_MINOR ) { - rc = alarmUtil_minor_log ( hostname, alarmid, entity ); + rc = alarmUtil_minor_log ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp ); } else if ( sev == 
FM_ALARM_SEVERITY_MAJOR) { - rc = alarmUtil_major_log ( hostname, alarmid, entity ); + rc = alarmUtil_major_log ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp ); } else if ( sev == FM_ALARM_SEVERITY_CRITICAL ) { - rc = alarmUtil_critical_log ( hostname, alarmid, entity ); + rc = alarmUtil_critical_log ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp ); } else { - rc = FAIL_INVALID_OPERATION ; - wlog ("Unsupported log severity '%d:%s'\n", sev, severity.c_str()); + rc = FM_ERR_INVALID_REQ ; + wlog ("Unsupported log severity '%d:%s'\n", sev, entry.severity.c_str()); } action="create log" ; } - /* Get the state */ - else if ( !operation.compare("clear")) + /* alarm clear request */ + else if ( entry.operation == "clear" ) { - rc = alarmUtil_clear ( hostname, alarmid, entity ); + rc = alarmUtil_clear ( entry.hostname, entry.alarmid, entry.entity ); } - else if ( !operation.compare("set") ) + /* alarm set request */ + else if ( entry.operation == "set" ) { if ( sev == FM_ALARM_SEVERITY_WARNING ) - rc = alarmUtil_warning ( hostname, alarmid, entity ); + rc = alarmUtil_warning ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp ); else if ( sev == FM_ALARM_SEVERITY_MINOR ) - rc = alarmUtil_minor ( hostname, alarmid, entity ); + rc = alarmUtil_minor ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp ); else if ( sev == FM_ALARM_SEVERITY_MAJOR ) - rc = alarmUtil_major ( hostname, alarmid, entity ); + rc = alarmUtil_major ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp ); else if ( sev == FM_ALARM_SEVERITY_CRITICAL ) - rc = alarmUtil_critical ( hostname, alarmid, entity ); + rc = alarmUtil_critical ( entry.hostname, entry.alarmid, entry.entity, entry.timestamp ); else { - rc = FAIL_INVALID_OPERATION ; + rc = FM_ERR_INVALID_REQ ; } } else { - rc = FAIL_BAD_CASE ; + rc = FM_ERR_INVALID_PARAMETER ; } - if ( rc ) + + /* Handle behavior based on return code */ + if ( rc == FM_ERR_OK ) { - elog ("%s failed to %s 
'%s:%s'\n", hostname.c_str(), action.c_str(), alarmid.c_str(), entity.c_str() ) + /* alarm call succeeded, pop off the list. */ + _pop_front(); } - return (rc); -} + else if ( rc == FM_ERR_ENTITY_NOT_FOUND ) + { + ilog ("%s %s '%s:%s' ; not found", + entry.hostname.c_str(), + action.c_str(), + entry.alarmid.c_str(), + entry.entity.c_str()); + _pop_front(); + } + /******************************************************************* + * Now these are non-success cases. + *******************************************************************/ + + /* Most typical failure case first - FM not running */ + else if (( rc == FM_ERR_NOCONNECT ) || + ( rc == FM_ERR_REQUEST_PENDING ) || + ( rc == FM_ERR_COMMUNICATIONS )) + { + if ( _holdoff_timestamp == 0 ) + _holdoff_timestamp = gettime_monotonic_nsec(); + + string type = "" ; + if ( rc == FM_ERR_NOCONNECT ) type = "not connected" ; + else if ( rc == FM_ERR_COMMUNICATIONS ) type = "communication error" ; + else if ( rc == FM_ERR_REQUEST_PENDING ) type = "pending request" ; + + wlog ("%s %s '%s:%s' failure ; %s ; retrying [q=%ld]", + entry.hostname.c_str(), + action.c_str(), + entry.alarmid.c_str(), + entry.entity.c_str(), + type.c_str(), + alarm_queue.size()); + } + + /* Look for cases where we don't want to retry. + * + * These would be cases that are unlikely to resolve with retry. 
+ */ + + /* pop off if alarm already asserted */ + else if ( rc == FM_ERR_ALARM_EXISTS ) + { + wlog ("%s %s '%s:%s' ; already exists", + entry.hostname.c_str(), + action.c_str(), + entry.alarmid.c_str(), + entry.entity.c_str()); + _pop_front(); + } + + /* never retry on any of these error cases */ + else if (( rc == FM_ERR_INVALID_REQ ) || + ( rc == FM_ERR_INVALID_ATTRIBUTE ) || + ( rc == FM_ERR_INVALID_PARAMETER ) || + ( rc == FM_ERR_DB_OPERATION_FAILURE ) || + ( rc == FM_ERR_RESOURCE_UNAVAILABLE )) + { + wlog ("%s failed to %s '%s:%s' ; dropped ; bad request [rc=%d]", + entry.hostname.c_str(), + action.c_str(), + entry.alarmid.c_str(), + entry.entity.c_str(), rc); + _pop_front(); + } + + /* never retry due to resource error on assert cases */ + else if (( rc == FM_ERR_NOMEM ) || + ( rc == FM_ERR_SERVER_NO_MEM ) || + ( rc == FM_ERR_NOT_ENOUGH_SPACE )) + { + wlog ("%s failed to %s '%s:%s' ; dropped ; resource error [rc=%d]", + entry.hostname.c_str(), + action.c_str(), + entry.alarmid.c_str(), + entry.entity.c_str(),rc ); + _pop_front(); + } + else + { + wlog ("%s failed to %s '%s:%s' ; dropped ; unexpected [rc=%d]", + entry.hostname.c_str(), + action.c_str(), + entry.alarmid.c_str(), + entry.entity.c_str(),rc ); + _pop_front(); + } + + /* pop from back if the queue is loaded to the max */ + if ( alarm_queue.size() > MAX_QUEUED_ALARMS ) + { + wlog ("%s %s '%s:%s' dropped ; most recent ; queue full", + entry.hostname.c_str(), + action.c_str(), + entry.alarmid.c_str(), + entry.entity.c_str() ); + _pop_back(); + } + else + { + ilog ("%ld queue entries to service", alarm_queue.size()); + } +} diff --git a/mtce/src/alarm/alarmUtil.cpp b/mtce/src/alarm/alarmUtil.cpp index 478cf8e7..5c0c2c92 100644 --- a/mtce/src/alarm/alarmUtil.cpp +++ b/mtce/src/alarm/alarmUtil.cpp @@ -217,6 +217,7 @@ int alarmUtil_query_identity ( string identity, SFmAlarmDataT * alarm_list_ptr, * * ********************************************************************************/ + int alarmUtil ( 
string & hostname, string & identity, string & instance, @@ -280,28 +281,26 @@ int alarmUtil ( string & hostname, alarm.service_affecting ? 'Y' : 'N', alarm.suppression ? 'Y' : 'N' ); - ilog ( "fm_set_fault: %s %s state:%d sev:%d type:%d cause:%d sa:%c supp:%c", - hostname.c_str(), - alarm.alarm_id, - alarm.alarm_state, - alarm.severity, - alarm.alarm_type, - alarm.probable_cause, - alarm.service_affecting ? 'Y' : 'N', - alarm.suppression ? 'Y' : 'N' ); - - rc = fm_set_fault ( &alarm , NULL ); - if ( rc != FM_ERR_OK ) +#ifdef WANT_FIT_TESTING + if (( daemon_is_file_present ( MTC_CMD_FIT__FM_ERROR_CODE )) && + ( daemon_want_fit ( FIT_CODE__FM_SET_ALARM, hostname ))) { - wlog ("%s fm_set_fault call failed for alarm %s (rc:%d) ; retrying\n", hostname.c_str(), alarm.alarm_id, rc); - usleep (100000); /* sleep 100 msec */ - rc = fm_set_fault ( &alarm , NULL ); - if ( rc != FM_ERR_OK ) - { - elog ("%s failed to set alarm %s (rc:%d) ; giving up\n", hostname.c_str(), alarm.alarm_id, rc); - rc = FAIL ; - } + rc = daemon_get_file_int(MTC_CMD_FIT__FM_ERROR_CODE) ; } + else +#endif + { + rc = fm_set_fault ( &alarm , NULL ); + } + if ( rc == FM_ERR_OK ) + { + ilog ( "%s %s %s alarm raised (%s)", + hostname.c_str(), + alarm.alarm_id, + alarm.entity_instance_id, + alarmUtil_getSev_str(alarm.severity).c_str()); + } + /* error cases are handled/logged in the caller's ; dequeue API */ } else { @@ -313,19 +312,26 @@ int alarmUtil ( string & hostname, alog ( "fm_clear_fault: %s %s:%s", hostname.c_str(), alarm.entity_instance_id, alarm.alarm_id ); - ilog ("%s clearing %s %s alarm\n", hostname.c_str(), alarm.alarm_id, alarm.entity_instance_id); - if ( ( rc = fm_clear_fault ( &filter )) != FM_ERR_OK ) +#ifdef WANT_FIT_TESTING + if (( daemon_is_file_present ( MTC_CMD_FIT__FM_ERROR_CODE )) && + ( daemon_want_fit ( FIT_CODE__FM_CLR_ALARM, hostname ))) { - if ( rc != FM_ERR_ENTITY_NOT_FOUND ) - { - elog ("%s failed to fm_clear_fault (rc:%d)\n", hostname.c_str(), rc ); - rc = FAIL ; - } - else 
- { - rc = PASS ; - } + rc = daemon_get_file_int(MTC_CMD_FIT__FM_ERROR_CODE) ; } + else +#endif + { + rc = fm_clear_fault ( &filter ); + } + + if ( rc == FM_ERR_OK ) + { + ilog ("%s %s %s alarm cleared\n", + hostname.c_str(), + alarm.alarm_id, + alarm.entity_instance_id); + } + /* error cases are handled/logged in the caller's ; dequeue API */ } return (rc); @@ -348,7 +354,7 @@ int alarmUtil_clear ( string hostname, string alarm_id , string entity ) } /** Assert a specified hosts's alarm with a CRITICAL severity level */ -int alarmUtil_critical ( string hostname, string alarm_id , string entity ) +int alarmUtil_critical ( string hostname, string alarm_id , string entity, FMTimeT & timestamp ) { alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id); if ( alarm_ptr ) @@ -358,6 +364,7 @@ int alarmUtil_critical ( string hostname, string alarm_id , string entity ) alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_CRITICAL ; alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_SET ; + if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ; snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->critl_reason.data()); @@ -368,7 +375,7 @@ int alarmUtil_critical ( string hostname, string alarm_id , string entity ) /** Assert a specified host's alarm with a MAJOR severity level */ -int alarmUtil_major ( string hostname, string alarm_id , string entity ) +int alarmUtil_major ( string hostname, string alarm_id , string entity, FMTimeT & timestamp ) { alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id); if ( alarm_ptr ) @@ -378,6 +385,7 @@ int alarmUtil_major ( string hostname, string alarm_id , string entity ) alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_MAJOR ; alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_SET ; + if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ; snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->major_reason.data()); @@ -385,8 +393,9 @@ int 
alarmUtil_major ( string hostname, string alarm_id , string entity ) } return (FAIL_NULL_POINTER); } + /** Assert a specified host's alarm with a MINOR severity level */ -int alarmUtil_minor ( string hostname, string alarm_id , string entity ) +int alarmUtil_minor ( string hostname, string alarm_id , string entity, FMTimeT & timestamp ) { alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id); if ( alarm_ptr ) @@ -396,6 +405,7 @@ int alarmUtil_minor ( string hostname, string alarm_id , string entity ) alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_MINOR ; alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_SET ; + if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ; snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->minor_reason.data()); @@ -405,7 +415,7 @@ int alarmUtil_minor ( string hostname, string alarm_id , string entity ) } /** Assert a specified host's alarm with a WARNING severity level */ -int alarmUtil_warning ( string hostname, string alarm_id , string entity ) +int alarmUtil_warning ( string hostname, string alarm_id , string entity, FMTimeT & timestamp ) { alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id); if ( alarm_ptr ) @@ -415,6 +425,7 @@ int alarmUtil_warning ( string hostname, string alarm_id , string entity alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_WARNING ; alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_SET ; + if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ; snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->minor_reason.data()); @@ -424,7 +435,7 @@ int alarmUtil_warning ( string hostname, string alarm_id , string entity } /** Create CRITICAL log */ -int alarmUtil_critical_log ( string hostname, string alarm_id , string entity ) +int alarmUtil_critical_log ( string hostname, string alarm_id , string entity, FMTimeT & timestamp ) { alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id); if ( alarm_ptr ) @@ 
-434,6 +445,7 @@ int alarmUtil_critical_log ( string hostname, string alarm_id , string entity ) alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_CRITICAL ; alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; + if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ; snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->critl_reason.data()); @@ -444,7 +456,7 @@ int alarmUtil_critical_log ( string hostname, string alarm_id , string entity ) /** Create MAJOR log */ -int alarmUtil_major_log ( string hostname, string alarm_id , string entity ) +int alarmUtil_major_log ( string hostname, string alarm_id , string entity, FMTimeT & timestamp ) { alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id); if ( alarm_ptr ) @@ -454,6 +466,7 @@ int alarmUtil_major_log ( string hostname, string alarm_id , string entity ) alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_MAJOR ; alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; + if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ; snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->major_reason.data()); @@ -462,7 +475,7 @@ int alarmUtil_major_log ( string hostname, string alarm_id , string entity ) return (FAIL_NULL_POINTER); } /** Create MINOR log */ -int alarmUtil_minor_log ( string hostname, string alarm_id , string entity ) +int alarmUtil_minor_log ( string hostname, string alarm_id , string entity, FMTimeT & timestamp ) { alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id); if ( alarm_ptr ) @@ -472,6 +485,7 @@ int alarmUtil_minor_log ( string hostname, string alarm_id , string entit alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_MINOR ; alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; + if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ; snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_ptr->minor_reason.data()); @@ -481,7 +495,7 @@ int alarmUtil_minor_log ( 
string hostname, string alarm_id , string entit } /** Create WARNING log */ -int alarmUtil_warning_log ( string hostname, string alarm_id, string entity, string prefix ) +int alarmUtil_warning_log ( string hostname, string alarm_id, string entity, string prefix, FMTimeT & timestamp ) { alarmUtil_type * alarm_ptr = alarmData_getAlarm_ptr(alarm_id); if ( alarm_ptr ) @@ -491,6 +505,7 @@ int alarmUtil_warning_log ( string hostname, string alarm_id, string entity, str alarm_ptr->alarm.severity = FM_ALARM_SEVERITY_WARNING ; alarm_ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; + if ( timestamp ) alarm_ptr->alarm.timestamp = timestamp ; snprintf ( alarm_ptr->alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), entity.data()); diff --git a/mtce/src/common/nodeClass.h b/mtce/src/common/nodeClass.h index 42198bad..b20fbe6f 100755 --- a/mtce/src/common/nodeClass.h +++ b/mtce/src/common/nodeClass.h @@ -650,6 +650,7 @@ private: /** @} private_monitoring_services_variables */ /* List of alarms and current severity */ + #define MAX_ALARMS (10) EFmAlarmSeverityT alarms[MAX_ALARMS]; /* tracks whether the alarms for this host have been loaded already or not */