metal/mtce/src/maintenance/mtcAlarm.cpp

1097 lines
42 KiB
C++

/*
* Copyright (c) 2015-2017, 2024 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
*/
/**
* @file
* Wind River Titanium Cloud 'Maintenance Agent' Alarm Module
*/
#include <sys/types.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
using namespace std;
#ifdef __AREA__
#undef __AREA__
#endif
#define __AREA__ "alm"
#include "daemon_common.h" /* */
#include "nodeBase.h" /* */
#include "nodeClass.h" /* */
#include "nodeTimers.h" /* */
#include "nodeUtil.h" /* */
#include "mtcAlarm.h" /* for ... this module header */
#include "hbsAlarm.h" /* for ... hbsAlarm stubs */
alarmUtil_type alarm_list[MTC_ALARM_ID__LAST] ;
void mtcAlarm_init ( void )
{
alarmUtil_type * ptr ;
/** Lock Alarm ************************************************************/
ptr = &alarm_list[MTC_ALARM_ID__LOCK];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", LOCK_ALARM_ID);
ptr->name = "Lock" ;
ptr->instc_prefix = "" ;
ptr->critl_reason =
ptr->major_reason =
ptr->minor_reason = "was administratively locked to take it out-of-service.";
ptr->clear_reason = "was administratively unlocked and is back in-service.";
ptr->alarm.alarm_type = FM_ALARM_OPERATIONAL;
ptr->alarm.probable_cause = FM_ALARM_OUT_OF_SERVICE ;
ptr->alarm.inhibit_alarms = FM_TRUE ;
ptr->alarm.service_affecting = FM_TRUE ;
ptr->alarm.suppression = FM_FALSE;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */
snprintf( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"Administratively unlock Host to bring it back in-service.");
/** pxeboot mtcAlive Alarm **************************************************/
ptr = &alarm_list[MTC_ALARM_ID__MTCALIVE];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", LOCK_ALARM_ID);
ptr->name = "mtcAlive" ;
ptr->instc_prefix = "" ;
ptr->critl_reason =
ptr->major_reason =
ptr->minor_reason = "pxeboot network communication failure";
ptr->clear_reason = "pxeboot network communication recovered";
ptr->alarm.alarm_type = FM_ALARM_COMM ;
ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN;
ptr->alarm.inhibit_alarms = FM_FALSE;
ptr->alarm.service_affecting = FM_FALSE;
ptr->alarm.suppression = FM_TRUE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */
snprintf( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"Administratively Lock and Unlock host to recover. If problem persists, contact next level of support.");
/** Enable Alarm ************************************************************/
ptr = &alarm_list[MTC_ALARM_ID__ENABLE];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", ENABLE_ALARM_ID);
ptr->name = "In-Service" ;
ptr->instc_prefix = "" ;
/* this is for a log */
ptr->minor_reason = "has experienced a minor In-Service test event. "
"No action is required. " ;
/* this is for an alarm and degrade */
ptr->major_reason = "Host Services failed to start.";
ptr->critl_reason = "experienced a service-affecting failure. "
"Auto-recovery in progress. "
"Manual Lock and Unlock may be required if auto-recovery is unsuccessful.";
ptr->clear_reason = "was auto recovered through Reboot and is now in-service if 'unlocked-enabled' "
"or is otherwise 'locked-disabled' by administrative 'lock' action.";
ptr->alarm.alarm_type = FM_ALARM_OPERATIONAL;
ptr->alarm.probable_cause = FM_ALARM_APP_SUBSYS_FAILURE ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_TRUE ;
ptr->alarm.suppression = FM_TRUE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */
snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"If auto-recovery is consistently unable to recover host to the unlocked-enabled "
"state contact next level of support or lock and replace failing Host.");
/** Configuration Alarm ************************************************************/
ptr = &alarm_list[MTC_ALARM_ID__CONFIG];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", CONFIG_ALARM_ID);
ptr->name = "Configuration" ;
ptr->instc_prefix = "" ;
ptr->critl_reason =
ptr->major_reason =
ptr->minor_reason = "experienced a configuration failure. ";
ptr->clear_reason = "has been successfully configured and is now in-service if 'unlocked-enabled' "
"or is otherwise 'locked-disabled' by administrative 'lock' action.";
ptr->alarm.alarm_type = FM_ALARM_OPERATIONAL;
ptr->alarm.probable_cause = FM_ALARM_CONFIG_ERROR ;
ptr->alarm.inhibit_alarms = FM_FALSE;
ptr->alarm.service_affecting = FM_TRUE ;
ptr->alarm.suppression = FM_TRUE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */
snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"If manual or auto-recovery is consistently unable to recover host to the unlocked-enabled "
"state contact next level of support or lock and replace failing Host.");
/** Init Board Management Controller Access Alarm Entry ******************/
ptr = &alarm_list[MTC_ALARM_ID__BM];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", BM_ALARM_ID);
ptr->name = "Board Management Controller Access" ;
ptr->instc_prefix = "" ;
ptr->critl_reason = "board management controller is unresponsive." ;
ptr->major_reason = "board management controller is unresponsive." ;
ptr->minor_reason = "access to board management module has failed." ;
ptr->clear_reason = "access to board management module is established" ;
ptr->alarm.alarm_type = FM_ALARM_OPERATIONAL ;
ptr->alarm.probable_cause = FM_ALARM_COMM_SUBSYS_FAILURE ;
ptr->alarm.inhibit_alarms = FM_FALSE;
ptr->alarm.service_affecting = FM_FALSE;
ptr->alarm.suppression = FM_FALSE;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */
snprintf( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"Check Host's board management config and connectivity.");
/** Init Compute Failure Alarm Entry *************************************/
ptr = &alarm_list[MTC_ALARM_ID__CH_COMP];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", CH_COMP_ALARM_ID);
ptr->name = "Compute Function" ;
ptr->instc_prefix = "" ;
ptr->minor_reason =
ptr->major_reason = "Compute service is not fully operational. Auto recovery in progress." ;
ptr->critl_reason = "Compute service of the only available controller is not operational. "
"Auto-recovery disabled. Degrading host instead.";
ptr->clear_reason = "compute service has recovered";
ptr->alarm.alarm_type = FM_ALARM_OPERATIONAL;
ptr->alarm.probable_cause = FM_ALARM_APP_SUBSYS_FAILURE ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_TRUE ;
ptr->alarm.suppression = FM_TRUE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */
snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"If alarm is against the only active controller then Enable second controller "
"and Switch Activity (Swact) to it as soon as possible. If the alarm "
"persists then Lock/Unlock host to recover its local compute service.");
/** LUKS volume config failure Alarm Entry *************************************/
ptr = &alarm_list[MTC_ALARM_ID__LUKS];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", LUKS_ALARM_ID);
ptr->name = "LUKS volume failure" ;
ptr->instc_prefix = "" ;
ptr->minor_reason =
ptr->major_reason =
ptr->critl_reason = "LUKS volume is not active or functioning properly.";
ptr->clear_reason = "'LUKS volume' has been successfully unsealed and service is functioning properly.";
ptr->alarm.alarm_type = FM_ALARM_OPERATIONAL;
ptr->alarm.probable_cause = FM_ALARM_APP_SUBSYS_FAILURE ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_FALSE ;
ptr->alarm.suppression = FM_TRUE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_CLEAR ; /* Dynamic */
snprintf (ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"If this alarm does not automatically clear after some time and "
"continues to be asserted after Host is locked and unlocked then "
"contact next level of support for root cause analysis and recovery.");
/** Init Event Log Entry *************************************************/
ptr = &alarm_list[MTC_LOG_ID__EVENT];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", EVENT_LOG_ID);
ptr->name = "Maintenance Event" ;
ptr->minor_reason =
ptr->major_reason =
ptr->critl_reason =
ptr->clear_reason = "";
ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ;
ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_FALSE ;
ptr->alarm.suppression = FM_FALSE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */
snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", "");
/** Init Command Log Entry ***********************************************/
ptr = &alarm_list[MTC_LOG_ID__COMMAND];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", COMMAND_LOG_ID);
ptr->name = "Maintenance Command" ;
ptr->minor_reason =
ptr->major_reason =
ptr->critl_reason =
ptr->clear_reason = "";
ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ;
ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_FALSE ;
ptr->alarm.suppression = FM_FALSE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */
snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", "");
/** Init Config Log Entry ***********************************************/
ptr = &alarm_list[MTC_LOG_ID__CONFIG];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", CONFIG_LOG_ID);
ptr->name = "Maintenance Config" ;
ptr->minor_reason =
ptr->major_reason =
ptr->critl_reason =
ptr->clear_reason = "";
ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ;
ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_FALSE ;
ptr->alarm.suppression = FM_FALSE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */
snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", "");
/** Init State Change Log Entry ******************************************/
ptr = &alarm_list[MTC_LOG_ID__STATECHANGE];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", STATECHANGE_LOG_ID);
ptr->name = "Maintenance State Change" ;
ptr->minor_reason =
ptr->major_reason =
ptr->critl_reason =
ptr->clear_reason = "";
ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ;
ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_FALSE ;
ptr->alarm.suppression = FM_FALSE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */
snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", "");
/** Init Service Status Log Entry ****************************************/
ptr = &alarm_list[MTC_LOG_ID__SERVICESTATUS];
memset (&ptr->alarm, 0, (sizeof(SFmAlarmDataT)));
snprintf(&ptr->alarm.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s", SERVICESTATUS_LOG_ID);
ptr->name = "Maintenance Service Status Change" ;
ptr->minor_reason =
ptr->major_reason =
ptr->critl_reason =
ptr->clear_reason = "";
ptr->alarm.alarm_type = FM_ALARM_TYPE_UNKNOWN ;
ptr->alarm.probable_cause = FM_ALARM_CAUSE_UNKNOWN ;
ptr->alarm.inhibit_alarms = FM_FALSE ;
ptr->alarm.service_affecting = FM_FALSE ;
ptr->alarm.suppression = FM_FALSE ;
ptr->alarm.severity = FM_ALARM_SEVERITY_CLEAR ; /* Dynamic */
ptr->alarm.alarm_state = FM_ALARM_STATE_MSG ; /* Dynamic */
snprintf ( ptr->alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH, "%s", "");
}
string _getIdentity ( mtc_alarm_id_enum id )
{
switch ( id )
{
case MTC_ALARM_ID__LOCK: return (LOCK_ALARM_ID);
case MTC_ALARM_ID__MTCALIVE: return (MTCALIVE_ALARM_ID);
case MTC_ALARM_ID__CONFIG: return (CONFIG_ALARM_ID);
case MTC_ALARM_ID__ENABLE: return (ENABLE_ALARM_ID);
case MTC_ALARM_ID__BM: return (BM_ALARM_ID);
case MTC_ALARM_ID__CH_COMP: return (CH_COMP_ALARM_ID);
case MTC_ALARM_ID__LUKS: return (LUKS_ALARM_ID);
case MTC_LOG_ID__EVENT: return (EVENT_LOG_ID);
case MTC_LOG_ID__COMMAND: return (COMMAND_LOG_ID);
case MTC_LOG_ID__STATECHANGE: return (STATECHANGE_LOG_ID);
case MTC_LOG_ID__CONFIG: return (CONFIG_LOG_ID);
default: return (SWERR_ALARM_ID);
}
}
string mtcAlarm_getId_str ( mtc_alarm_id_enum id )
{
return(_getIdentity(id));
}
string _getInstance ( mtc_alarm_id_enum id )
{
id = id ;
return ("");
}
EFmAlarmSeverityT mtcAlarm_state ( string hostname, mtc_alarm_id_enum id )
{
string identity = _getIdentity(id) ;
string instance = _getInstance(id) ;
return ( alarmUtil_query ( hostname, identity, instance));
}
void mtcAlarm_clear_all ( string hostname )
{
for ( int i = 0 ; i < MTC_ALARM_ID__LAST ; ++i )
{
mtcAlarm_clear ( hostname, (mtc_alarm_id_enum)i );
}
}
/****************************************************************************
*
* Name : mtcAlarm_audit
*
* Purpose : Monitor and Auto-Correct maintenance alarms
*
* Description: Query locked state alarm (raw)
* if successful
* - Query alarms
* - compare to running state
* - correct mismatches ; internal state takes precidence
* - log all alarm state changes
*
****************************************************************************/
void nodeLinkClass::mtcAlarm_audit ( struct nodeLinkClass::node * node_ptr )
{
/*
* Read locked state alarm directly to detect fm access failures.
* If successful further reads are done using a wrapper utility.
*/
SFmAlarmDataT alarm_query ;
AlarmFilter alarm_filter ;
EFmErrorT rc ;
memset(&alarm_query, 0, sizeof(alarm_query));
memset(&alarm_filter, 0, sizeof(alarm_filter));
snprintf ( &alarm_filter.alarm_id[0], FM_MAX_BUFFER_LENGTH, "%s",
LOCK_ALARM_ID);
snprintf ( &alarm_filter.entity_instance_id[0], FM_MAX_BUFFER_LENGTH, "%s%s",
ENTITY_PREFIX, node_ptr->hostname.data());
rc = fm_get_fault ( &alarm_filter, &alarm_query );
if (( rc != FM_ERR_OK ) && ( rc != FM_ERR_ENTITY_NOT_FOUND ))
{
wlog("%s alarm query failure ; code:%d",
node_ptr->hostname.c_str(),
rc );
return ;
}
/* With FM comms proven working lets check the other mtc alarms */
string active_alarms = "";
for ( int i = 0 ; i < MAX_ALARMS ; i++ )
{
mtc_alarm_id_enum id = (mtc_alarm_id_enum)i ;
if ( id == MTC_ALARM_ID__LOCK )
{
/* Unexpected severity case */
if ( node_ptr->adminState == MTC_ADMIN_STATE__LOCKED )
{
if ( alarm_query.severity != FM_ALARM_SEVERITY_WARNING )
{
node_ptr->alarms[id] = FM_ALARM_SEVERITY_WARNING ;
wlog("%s %s alarm mismatch ; %s -> %s",
node_ptr->hostname.c_str(),
_getIdentity(id).c_str(),
alarmUtil_getSev_str(alarm_query.severity).c_str(),
alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
mtcAlarm_warning ( node_ptr->hostname, MTC_ALARM_ID__LOCK );
}
if (!active_alarms.empty())
active_alarms.append(", ");
active_alarms.append(_getIdentity(id) + ":");
active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id]));
}
/* Unexpected assertion case */
else if (( node_ptr->adminState == MTC_ADMIN_STATE__UNLOCKED ) &&
( alarm_query.severity != FM_ALARM_SEVERITY_CLEAR ))
{
node_ptr->alarms[id] = FM_ALARM_SEVERITY_CLEAR ;
wlog("%s %s alarm mismatch ; %s -> %s",
node_ptr->hostname.c_str(),
_getIdentity(id).c_str(),
alarmUtil_getSev_str(alarm_query.severity).c_str(),
alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
mtcAlarm_clear ( node_ptr->hostname, id );
}
}
else if (( id == MTC_ALARM_ID__CONFIG ) ||
( id == MTC_ALARM_ID__ENABLE ) ||
( id == MTC_ALARM_ID__BM ) ||
( id == MTC_ALARM_ID__CH_COMP))
{
EFmAlarmSeverityT severity = mtcAlarm_state ( node_ptr->hostname, id);
if ( severity != node_ptr->alarms[id] )
{
ilog ("%s %s alarm mismatch ; %s -> %s",
node_ptr->hostname.c_str(),
_getIdentity(id).c_str(),
alarmUtil_getSev_str(severity).c_str(),
alarmUtil_getSev_str(node_ptr->alarms[id]).c_str());
if ( node_ptr->alarms[id] == FM_ALARM_SEVERITY_CLEAR )
{
mtcAlarm_clear ( node_ptr->hostname, id );
}
else
{
mtcAlarm_raise ( node_ptr->hostname, id, node_ptr->alarms[id] );
}
}
if ( node_ptr->alarms[id] != FM_ALARM_SEVERITY_CLEAR )
{
if (!active_alarms.empty())
active_alarms.append(", ");
active_alarms.append(_getIdentity(id) + ":");
active_alarms.append(alarmUtil_getSev_str(node_ptr->alarms[id]));
}
}
/* else don't care about other alarm ids ; logs events etc */
}
/* manage logging of active alarms */
if ( !active_alarms.empty() )
{
if ( node_ptr->active_alarms != active_alarms )
{
ilog ("%s active alarms: %s",
node_ptr->hostname.c_str(),
active_alarms.c_str());
node_ptr->active_alarms = active_alarms ;
}
/* else
* do nothing because there are active alarms
* that have not changed since the last audit.
*/
}
else if ( ! node_ptr->active_alarms.empty() )
{
/* clear active alarm list since there 'were' active alarms
* but there are no longer active alarms */
node_ptr->active_alarms.clear();
ilog ("%s no active alarms", node_ptr->hostname.c_str());
}
/* else
* no active alarms ; don't log */
}
/************************* A L A R M I N G **************************/
/* Raise the specified maintenance alarm severity */
int mtcAlarm_raise ( string hostname, mtc_alarm_id_enum id, EFmAlarmSeverityT severity )
{
switch ( severity )
{
case FM_ALARM_SEVERITY_MINOR:
return (mtcAlarm_minor(hostname,id));
case FM_ALARM_SEVERITY_MAJOR:
return (mtcAlarm_major(hostname,id));
case FM_ALARM_SEVERITY_CRITICAL:
return (mtcAlarm_critical(hostname,id));
default:
return (FAIL_BAD_PARM);
}
}
/* Clear the specified hosts's maintenance alarm */
int mtcAlarm_clear ( string hostname, mtc_alarm_id_enum id )
{
if ( id < MTC_ALARM_ID__LAST )
{
string identity = _getIdentity(id);
string instance = _getInstance(id);
ilog ("%s clearing '%s' alarm (%s%s)\n",
hostname.c_str(),
alarm_list[id].name.c_str(),
identity.c_str(),
instance.c_str());
snprintf( alarm_list[id].alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_list[id].clear_reason.data());
return ( alarmUtil_clear ( hostname, identity, instance, alarm_list[id].alarm ));
}
return (FAIL_BAD_PARM);
}
/** Assert a specified hosts's mtce alarm with a CRITICAL severity level */
int mtcAlarm_critical ( string hostname, mtc_alarm_id_enum id )
{
if ( id < MTC_ALARM_ID__LAST )
{
string identity = _getIdentity(id);
string instance = _getInstance(id);
elog ("%s setting critical '%s' failure alarm (%s %s)\n",
hostname.c_str(),
alarm_list[id].name.c_str(),
identity.c_str(),
instance.c_str());
snprintf ( alarm_list[id].alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_list[id].critl_reason.data());
return ( alarmUtil_critical ( hostname, identity, instance, alarm_list[id].alarm ));
}
return (FAIL_BAD_PARM);
}
/** Assert a specified host's mtce alarm with a MAJOR severity level */
int mtcAlarm_major ( string hostname, mtc_alarm_id_enum id )
{
if ( id < MTC_ALARM_ID__LAST )
{
string identity = _getIdentity(id);
string instance = _getInstance(id);
wlog ("%s setting major '%s' failure alarm (%s %s)\n",
hostname.c_str(),
alarm_list[id].name.c_str(),
identity.c_str(),
instance.c_str());
if ( id == MTC_ALARM_ID__BM )
{
snprintf( alarm_list[id].alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"board managment controller 'reset' or 'power-cycle' is recommended.");
}
else if ( id == MTC_ALARM_ID__ENABLE )
{
snprintf( alarm_list[id].alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"If alarm persists, host may require lock/unlock to recover. See maintenance logs for more detail.");
}
snprintf ( alarm_list[id].alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_list[id].major_reason.data());
return ( alarmUtil_major ( hostname, identity, instance, alarm_list[id].alarm ));
}
return (FAIL_BAD_PARM);
}
/** Assert a specified host's mtce alarm with a MINOR severity level */
int mtcAlarm_minor ( string hostname, mtc_alarm_id_enum id )
{
if ( id < MTC_ALARM_ID__LAST )
{
string identity = _getIdentity(id);
string instance = _getInstance(id);
wlog ("%s setting minor '%s' failure alarm (%s %s)\n",
hostname.c_str(),
alarm_list[id].name.c_str(),
identity.c_str(),
instance.c_str());
snprintf ( alarm_list[id].alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_list[id].minor_reason.data());
return ( alarmUtil_minor ( hostname, identity, instance, alarm_list[id].alarm ));
}
return (FAIL_BAD_PARM);
}
/** Assert a specified host's mtce alarm with a WARNING severity level */
int mtcAlarm_warning ( string hostname, mtc_alarm_id_enum id )
{
if ( id < MTC_ALARM_ID__LAST )
{
string identity = _getIdentity(id);
string instance = _getInstance(id);
wlog ("%s setting warning '%s' alarm (%s %s)\n",
hostname.c_str(),
alarm_list[id].name.c_str(),
identity.c_str(),
instance.c_str());
if ( id == MTC_ALARM_ID__BM )
{
snprintf( alarm_list[id].alarm.proposed_repair_action, FM_MAX_BUFFER_LENGTH,
"Check Host's board management config and connectivity.");
}
snprintf ( alarm_list[id].alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_list[id].minor_reason.data());
return ( alarmUtil_warning ( hostname, identity, instance, alarm_list[id].alarm ));
}
return (FAIL_BAD_PARM);
}
/*************************** L O G G I N G **********************************/
/** Create a CRITICAL maintenance log */
int mtcAlarm_critical_log ( string hostname, mtc_alarm_id_enum id )
{
if ( id < MTC_ALARM_ID__LAST )
{
string identity = _getIdentity(id);
string instance = _getInstance(id);
elog ("%s creating critical '%s' log (%s %s)\n",
hostname.c_str(),
alarm_list[id].name.c_str(),
identity.c_str(),
instance.c_str());
snprintf ( alarm_list[id].alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_list[id].critl_reason.data());
return ( alarmUtil_critical_log ( hostname, identity, instance, alarm_list[id].alarm ));
}
return (FAIL_BAD_PARM);
}
/** Create a MAJOR maintenance log */
int mtcAlarm_major_log ( string hostname, mtc_alarm_id_enum id )
{
if ( id < MTC_ALARM_ID__LAST )
{
string identity = _getIdentity(id);
string instance = _getInstance(id);
wlog ("%s creating major '%s' log (%s %s)\n",
hostname.c_str(),
alarm_list[id].name.c_str(),
identity.c_str(),
instance.c_str());
snprintf ( alarm_list[id].alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s %s", hostname.data(), alarm_list[id].major_reason.data());
return ( alarmUtil_major_log ( hostname, identity, instance, alarm_list[id].alarm ));
}
return (FAIL_BAD_PARM);
}
/** Create a MINOR maintenance log */
int mtcAlarm_minor_log ( string hostname, mtc_alarm_id_enum id )
{
if ( id < MTC_ALARM_ID__LAST )
{
string identity = _getIdentity(id);
string instance = _getInstance(id);
wlog ("%s creating minor '%s' log (%s %s)\n",
hostname.c_str(),
alarm_list[id].name.c_str(),
identity.c_str(),
instance.c_str());
snprintf ( alarm_list[id].alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s", alarm_list[id].minor_reason.data());
return ( alarmUtil_minor_log ( hostname, identity, instance, alarm_list[id].alarm ));
}
return (FAIL_BAD_PARM);
}
/** Create a WARNING maintenance log */
int mtcAlarm_warning_log ( string hostname, mtc_alarm_id_enum id )
{
if ( id < MTC_ALARM_ID__LAST )
{
string identity = _getIdentity(id);
string instance = _getInstance(id);
wlog ("%s creating warning '%s' log (%s %s)\n",
hostname.c_str(),
alarm_list[id].name.c_str(),
identity.c_str(),
instance.c_str());
snprintf ( alarm_list[id].alarm.reason_text, FM_MAX_BUFFER_LENGTH, "%s", alarm_list[id].minor_reason.data());
return ( alarmUtil_warning_log ( hostname, identity, instance, alarm_list[id].alarm ));
}
return (FAIL_BAD_PARM);
}
/** Create a neutral customer log */
int mtcAlarm_log ( string hostname, mtc_alarm_id_enum id, string str )
{
if ( id < MTC_ALARM_ID__END )
{
/* default to command */
mtc_alarm_id_enum index = MTC_LOG_ID__COMMAND ;
bool found = false ;
if ( id == MTC_LOG_ID__EVENT_ADD )
{
index = MTC_LOG_ID__EVENT ;
alarm_list[index].instc_prefix = "event=add" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"has been 'added' to the system" );
found = true ;
}
else if ( id == MTC_LOG_ID__EVENT_MNFA_ENTER )
{
index = MTC_LOG_ID__EVENT ;
alarm_list[index].instc_prefix = "event=mnfa_enter" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"has 'entered' multi-node failure avoidance" );
found = true ;
}
else if ( id == MTC_LOG_ID__EVENT_MNFA_EXIT )
{
index = MTC_LOG_ID__EVENT ;
alarm_list[index].instc_prefix = "event=mnfa_exit" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"has 'exited' multi-node failure avoidance" );
found = true ;
}
else if ( id == MTC_LOG_ID__STATUSCHANGE_FAILED )
{
index = MTC_LOG_ID__STATECHANGE ;
alarm_list[index].instc_prefix = "status=failed" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"is 'disabled-failed' to the system" );
found = true ;
}
else if ( id == MTC_LOG_ID__STATUSCHANGE_ENABLED )
{
index = MTC_LOG_ID__STATECHANGE ;
alarm_list[index].instc_prefix = "state=enabled" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"is now 'enabled'" );
found = true ;
}
else if ( id == MTC_LOG_ID__STATUSCHANGE_DISABLED )
{
index = MTC_LOG_ID__STATECHANGE ;
alarm_list[index].instc_prefix = "state=disabled" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"is now 'disabled'" );
found = true ;
}
else if ( id == MTC_LOG_ID__STATUSCHANGE_OFFLINE )
{
index = MTC_LOG_ID__STATECHANGE ;
alarm_list[index].instc_prefix = "status=offline" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"is now 'offline'" );
found = true ;
}
else if ( id == MTC_LOG_ID__STATUSCHANGE_ONLINE )
{
index = MTC_LOG_ID__STATECHANGE ;
alarm_list[index].instc_prefix = "status=online" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"is now 'online'" );
found = true ;
}
else if ( id == MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED )
{
index = MTC_LOG_ID__STATECHANGE ;
alarm_list[index].instc_prefix = "status=reinstall-failed" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"reinstall failed" );
found = true ;
}
else if ( id == MTC_LOG_ID__STATUSCHANGE_REINSTALL_COMPLETE )
{
index = MTC_LOG_ID__STATECHANGE ;
alarm_list[index].instc_prefix = "status=reinstall-complete" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"reinstall completed successfully" );
found = true ;
}
else if ( id == MTC_LOG_ID__COMMAND_UNLOCK )
{
alarm_list[index].instc_prefix = "command=unlock" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"manual 'unlock' request" );
found = true ;
}
else if ( id == MTC_LOG_ID__COMMAND_FORCE_LOCK )
{
alarm_list[index].instc_prefix = "command=force-lock" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"manual 'force-lock' request" );
found = true ;
}
else if ( id == MTC_LOG_ID__COMMAND_SWACT )
{
alarm_list[index].instc_prefix = "command=swact" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"manual 'controller switchover' request" );
found = true ;
}
else if ( id == MTC_LOG_ID__COMMAND_MANUAL_REBOOT )
{
alarm_list[index].instc_prefix = "command=reboot" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"manual 'reboot' request" );
found = true ;
}
else if ( id == MTC_LOG_ID__COMMAND_AUTO_REBOOT )
{
alarm_list[index].instc_prefix = "action=reboot" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"'reboot' action" );
found = true ;
}
else if ( id == MTC_LOG_ID__COMMAND_MANUAL_RESET )
{
alarm_list[index].instc_prefix = "command=reset" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"manual 'reset' request" );
found = true ;
}
else if ( id == MTC_LOG_ID__COMMAND_AUTO_RESET )
{
alarm_list[index].instc_prefix = "action=reset" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"'reset' action" );
found = true ;
}
else if ( id == MTC_LOG_ID__COMMAND_REINSTALL )
{
alarm_list[index].instc_prefix = "command=reinstall" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"manual 'reinstall' request" );
found = true ;
}
else if ( id == MTC_LOG_ID__COMMAND_MANUAL_POWER_ON )
{
alarm_list[index].instc_prefix = "command=power-on" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"manual 'power-on' request" );
found = true ;
}
else if ( id == MTC_LOG_ID__COMMAND_AUTO_POWER_ON )
{
alarm_list[index].instc_prefix = "action=power-on" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"'power-on' action" );
found = true ;
}
else if ( id == MTC_LOG_ID__COMMAND_MANUAL_POWER_OFF )
{
alarm_list[index].instc_prefix = "command=power-off" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"manual 'power-off' request" );
found = true ;
}
else if ( id == MTC_LOG_ID__COMMAND_AUTO_POWER_OFF )
{
alarm_list[index].instc_prefix = "action=power-off" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"'power-off' action" );
found = true ;
}
else if ( id == MTC_LOG_ID__COMMAND_DELETE )
{
alarm_list[index].instc_prefix = "command=delete" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"manual 'delete' request" );
found = true ;
}
else if ( id == MTC_LOG_ID__COMMAND_BM_PROVISIONED )
{
alarm_list[index].instc_prefix = "command=provision" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"board management controller has been 'provisioned'" );
found = true ;
}
else if ( id == MTC_LOG_ID__COMMAND_BM_DEPROVISIONED )
{
alarm_list[index].instc_prefix = "command=deprovision" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"board management controller has been 'de-provisioned'" );
found = true ;
}
else if ( id == MTC_LOG_ID__COMMAND_BM_REPROVISIONED )
{
alarm_list[index].instc_prefix = "command=reprovision" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s",
hostname.data(),
"board management controller has been 're-provisioned'" );
found = true ;
}
else if (( id == MTC_LOG_ID__CONFIG_HB_ACTION_FAIL ) ||
( id == MTC_LOG_ID__CONFIG_HB_ACTION_DEGRADE ) ||
( id == MTC_LOG_ID__CONFIG_HB_ACTION_ALARM ) ||
( id == MTC_LOG_ID__CONFIG_HB_ACTION_NONE ))
{
alarm_list[index].instc_prefix = "config=heartbeat_failure_action" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s %s",
hostname.data(),
"platform maintenance service parameter 'heartbeat failure action' changed from",
str.data());
found = true ;
}
else if ( id == MTC_LOG_ID__CONFIG_MNFA_TIMEOUT )
{
alarm_list[index].instc_prefix = "config=mnfa_timeout" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s %s",
hostname.data(),
"platform maintenance service parameter 'mnfa_timeout' changed from",
str.data());
found = true ;
}
else if ( id == MTC_LOG_ID__CONFIG_MNFA_THRESHOLD )
{
alarm_list[index].instc_prefix = "config=mnfa_threshold" ;
snprintf ( alarm_list[index].alarm.reason_text,
FM_MAX_BUFFER_LENGTH, "%s %s %s",
hostname.data(),
"platform maintenance service parameter 'mnfa_threshold' changed from",
str.data());
found = true ;
}
if ( found == true )
{
int rc ;
string identity = _getIdentity(index);
string instance = _getInstance(index);
instance.append(alarm_list[index].instc_prefix);
/* Want to make this log a critical */
if ( id == MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED )
{
alarm_list[index].alarm.severity = FM_ALARM_SEVERITY_CRITICAL ;
}
rc = alarmUtil_log ( hostname, identity, instance, alarm_list[index].alarm );
/* Revert the severity of the event log back to Clear ( shows up as N/A ) */
if ( id == MTC_LOG_ID__STATUSCHANGE_REINSTALL_FAILED )
{
alarm_list[MTC_LOG_ID__STATECHANGE].alarm.severity = FM_ALARM_SEVERITY_CLEAR ;
}
return (rc);
}
}
return (FAIL_BAD_PARM);
}