metal/mtce/src/maintenance/mtcNodeMnfa.cpp

442 lines
15 KiB
C++

/*
* Copyright (c) 2013-2016 Wind River Systems, Inc.
*
* SPDX-License-Identifier: Apache-2.0
*
*/
/**
* @file
* Wind River CGTS Platform Node Maintenance
* "Multi-Node-Failure Avoidance feature utility implementation"
*
*/
#include <sys/types.h>
#include <iostream>
#include <string.h>
#include <stdio.h>
#include <algorithm>
#include <list>
#include <vector>
using namespace std;
#include "nodeBase.h"
#include "nodeClass.h"
#include "nodeTimers.h"
#include "mtcNodeHdlrs.h"
/* Emit one info log that lists every hostname currently held in the
 * supplied mnfa pool list. Nothing is logged when the list is empty. */
void log_mnfa_pool ( std::list<string> & mnfa_awol_list )
{
    if ( mnfa_awol_list.empty() )
    {
        return ;
    }

    /* Build the list text ; every hostname is preceded by a single space */
    string names ;
    std::list<string>::iterator host = mnfa_awol_list.begin() ;
    while ( host != mnfa_awol_list.end() )
    {
        names += " " ;
        names += host->c_str() ;
        ++host ;
    }
    ilog ("MNFA POOL:%s\n", names.c_str());
}
/*****************************************************************************
 *
 * Name       : add_host_to_awol_list
 *
 * Description: Add a hostname to the awol list if it's not already in the list
 *
 * Parameters : hostname       - name of the host to add
 *              mnfa_awol_list - list that is searched and, when the name is
 *                               not found, appended to
 *
 * Returns    : true if added
 *              false if not added because it is already in the list.
 *
 *****************************************************************************/
static bool add_host_to_awol_list ( const std::string & hostname,
                                    std::list<std::string> & mnfa_awol_list )
{
    /* The hostname is taken by const reference so that no copy is made
     * just to perform the duplicate check. */
    if ( std::find ( mnfa_awol_list.begin(),
                     mnfa_awol_list.end(),
                     hostname ) != mnfa_awol_list.end() )
    {
        /* already in list */
        return false ;
    }
    mnfa_awol_list.push_back ( hostname );
    return true ;
}
/*****************************************************************************
 *
 * Name       : mnfa_add_host
 *
 * Description: Account for a host that has lost heartbeat on the specified
 *              interface.
 *
 *              Nothing is done when the heartbeat failure action is
 *              'alarm' or 'none', or when the host is already flagged
 *              hbs_minor on that interface.
 *
 *              Otherwise the host is flagged hbs_minor, the per host and
 *              per interface counters are incremented and then ...
 *
 *              - if mnfa is already active the host is given the graceful
 *                recovery token, added to the awol list and its task is
 *                set to recovery wait.
 *
 *              - if mnfa is not active and the interface count has reached
 *                mnfa_threshold then mnfa mode is entered after the
 *                status has been logged.
 *
 * Parameters : node_ptr - the host that failed heartbeat
 *              iface    - the interface the failure was seen on
 *
 *****************************************************************************/
void nodeLinkClass::mnfa_add_host ( struct nodeLinkClass::node * node_ptr , iface_enum iface )
{
    /* Do nothing for the 'alarm only' or 'none' action.
     * Alarming is handled by the hbsAgent already */
    const bool passive_action =
        (( this->hbs_failure_action == HBS_FAILURE_ACTION__ALARM ) ||
         ( this->hbs_failure_action == HBS_FAILURE_ACTION__NONE  ));
    if ( passive_action )
    {
        return ;
    }

    /* This host is already counted against this interface */
    if ( !( node_ptr->hbs_minor[iface] == false ))
    {
        return ;
    }

    node_ptr->hbs_minor[iface] = true ;
    node_ptr->hbs_minor_count[iface]++ ;
    mnfa_host_count[iface]++ ;

    bool joined_pool       = false ;
    bool threshold_crossed = false ;

    if ( mnfa_active == false )
    {
        /* not in mnfa yet ; see if this failure takes us over the line */
        if ( mnfa_host_count[iface] >= this->mnfa_threshold )
        {
            threshold_crossed = true ;
        }
    }
    else if ( mnfa_active == true )
    {
        /* once we are mnfa_active we need to give all the
         * hbs_minor=true hosts a graceful recovery token
         * mnfa_graceful_recovery = true and add to the awol list */
        node_ptr->mnfa_graceful_recovery = true ;
        joined_pool = true ;
        add_host_to_awol_list ( node_ptr->hostname, mnfa_awol_list );
        if ( node_ptr->task != MTC_TASK_RECOVERY_WAIT )
        {
            mtcInvApi_update_task ( node_ptr, MTC_TASK_RECOVERY_WAIT );
        }
    }

    const char * role_str = joined_pool ? "added to pool" : "new candidate" ;
    ilog ("%s MNFA %s (%s) %d enabled hosts (threshold:%d) (%d:%s:%d) (%d:%s:%d)\n",
              node_ptr->hostname.c_str(),
              role_str,
              get_iface_name_str(iface),
              enabled_nodes(),
              mnfa_threshold,
              mnfa_host_count[MGMNT_IFACE],
              get_iface_name_str(MGMNT_IFACE),
              node_ptr->hbs_minor_count[MGMNT_IFACE],
              mnfa_host_count[CLSTR_IFACE],
              get_iface_name_str(CLSTR_IFACE),
              node_ptr->hbs_minor_count[CLSTR_IFACE]);

    log_mnfa_pool ( mnfa_awol_list );

    /* mnfa mode is entered only after the status above has been logged */
    if ( threshold_crossed == true )
    {
        mnfa_enter ();
    }
}
/*****************************************************************************
 *
 * Name       : mnfa_recover_host
 *
 * Description: Recover a host that may or may not be in the mnfa pool.
 *
 *              A degraded host whose degrade mask is empty is first moved
 *              back to the available state.
 *
 *              A host that holds the mnfa_graceful_recovery token is then
 *              sent into the graceful recovery FSM from its start stage.
 *              Hosts without the token are not otherwise touched.
 *
 * Parameters : node_ptr - the host to recover
 *
 *****************************************************************************/
void nodeLinkClass::mnfa_recover_host ( struct nodeLinkClass::node * node_ptr )
{
    /* degraded with nothing left to be degraded over ; make it available */
    if (( node_ptr->availStatus == MTC_AVAIL_STATUS__DEGRADED ) &&
        ( node_ptr->degrade_mask == 0 ))
    {
        availStatusChange ( node_ptr, MTC_AVAIL_STATUS__AVAILABLE );
    }

    /* only hosts holding the graceful recovery token go any further */
    if ( !( node_ptr->mnfa_graceful_recovery == true ))
    {
        return ;
    }

    if ( node_ptr->adminAction == MTC_ADMIN_ACTION__RECOVER )
    {
        /* the host is already in the recover action ; this restarts it */
        wlog ("%s graceful recovery restart (graceful recover count:%d)",
                  node_ptr->hostname.c_str(),
                  node_ptr->graceful_recovery_counter );
    }
    else
    {
        ilog ("%s graceful recovery (graceful recover count:%d)",
                  node_ptr->hostname.c_str(),
                  node_ptr->graceful_recovery_counter);
    }

    recoveryStageChange ( node_ptr, MTC_RECOVERY__START );
    adminActionChange   ( node_ptr, MTC_ADMIN_ACTION__RECOVER );
}
/****************************************************************************
 *
 * Name       : mnfa_enter
 *
 * Description: Perform the operations required to enter mnfa mode
 *
 * These include ...
 *
 *  1. Log the mnfa enter event against the active controller.
 *
 *  2. Set mode active and mark heartbeat backoff as in effect.
 *
 *  3. Send the backoff command to heartbeat service. This tells the
 *     heartbeat service to send heartbeat requests less frequently.
 *
 *  4. Cancel the mnfa timer in case a previous mnfa recovery is pending.
 *
 *  5. Store all the enabled hosts that have hbs_minor=true on either
 *     interface into the mnfa_awol_list, give each of them the
 *     mnfa_graceful_recovery token and set their task to recovery wait.
 *
 *  6. Start the MNFA Auto-Recovery timer with time based on the config
 *     setting mnfa_timeout. No timer is started when mnfa_timeout is 0.
 *
 ****************************************************************************/
void nodeLinkClass::mnfa_enter ( void )
{
    wlog ("MNFA ENTER --> Entering Multi-Node Failure Avoidance\n");
    mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_ENTER );
    mnfa_active  = true ;
    mnfa_backoff = true ;

    send_hbs_command ( my_hostname, MTC_BACKOFF_HBS );

    /* Handle the case where we are already trying to recover from a
     * previous mnfa but the failure case occurs again. If that
     * happens we need to cancel the timer that will issue
     * the period recovery command. */
    mtcTimer_reset ( mtcTimer_mnfa );

    /* Loop through inventory and pool each enabled host that
     * is in the hbs_minor state.
     *
     * The head pointer is checked before it is dereferenced so that an
     * empty inventory list is walked zero times rather than faulting. */
    for ( struct node * ptr = head ; ptr != NULL ; ptr = ptr->next )
    {
        if ((( ptr->hbs_minor[MGMNT_IFACE] == true ) ||
             ( ptr->hbs_minor[CLSTR_IFACE] == true )) &&
             ( ptr->operState == MTC_OPER_STATE__ENABLED ))
        {
            /* Give all the hosts in the mnfa list a graceful
             * recovery token mnfa_graceful_recovery = true
             * basically a get out of double reset free card */
            ptr->mnfa_graceful_recovery = true ;
            add_host_to_awol_list ( ptr->hostname, mnfa_awol_list );
            if ( ptr->task != MTC_TASK_RECOVERY_WAIT )
            {
                mtcInvApi_update_task ( ptr, MTC_TASK_RECOVERY_WAIT );
            }
        }

        /* never walk past the tail of the inventory list */
        if ( ptr == tail )
        {
            break ;
        }
    }

    if ( this->mnfa_timeout )
    {
        wlog ("MNFA Auto-Recovery in %d seconds\n", this->mnfa_timeout);
        mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, this->mnfa_timeout);
    }
    else
    {
        /* No auto-recovery timer is running in this case. Make sure a
         * stale ring indication is not left set ; mnfa_recovery_handler
         * treats ring=true while mnfa is active as a timeout and would
         * force an exit. */
        this->mtcTimer_mnfa.ring = false ;
    }
    log_mnfa_pool ( mnfa_awol_list );
}
/****************************************************************************
*
* Name : mnfa_exit
*
* Description: Perform the operations required to exit mnfa mode
* These include ...
*
* 1. manage mnfa counters/oms
*
* 2. disable mnfa mode (mnfa_active = false)
*
* 3. Start the heartbeat recovery timer. This is a timer that
* adds a bit of debounce to the recovery.
* After MTC_MNFA_RECOVERY_TIMER time period mtce will send
* a command to the heartbeat service commanding it to
* re-instate the default/runtime heartbeat period.
*
* 4. Loop through all the enabled inventory and clear the heartbeat
* degrade conditions and issue a heartbeat restart to any
* hosts that remain in the hbs_minor state.
*
* if ( force == true )
* The mnfa_timeout has expired
* All hosts in the awol list are forced failed and into the
* enable_handler FSM.
* else
* The mnfa recovery threshold has crossed
* Send all enabled hosts in the hbs_minor=true state into the
* graceful recovery FSM
*
****************************************************************************/
void nodeLinkClass::mnfa_exit ( bool force )
{
if ( mnfa_active == true )
{
mnfa_occurances++ ;
mnfa_active = false ;
if ( force == true )
{
elog ("... MNFA %d sec timeout - forcing full enable on ... \n",
this->mnfa_timeout);
}
wlog ("MNFA EXIT <-- Exiting Multi-Node Failure Avoidance %s\n",
force ? "(Auto-Recover)" : "");
mtcAlarm_log ( active_controller_hostname , MTC_LOG_ID__EVENT_MNFA_EXIT );
log_mnfa_pool ( mnfa_awol_list );
/* Loop through inventory and recover each host that
* remains in the hbs_minor state.
* Clear heartbeat degrades */
for ( struct node * ptr = head ; ; ptr = ptr->next )
{
std::list<string>::iterator mnfa_awol_ptr ;
for ( mnfa_awol_ptr = mnfa_awol_list.begin() ;
mnfa_awol_ptr != mnfa_awol_list.end() ;
mnfa_awol_ptr++ )
{
/* skip host if not in the mnfa pool */
if ( ptr->hostname.compare(*(mnfa_awol_ptr)) )
continue ;
if ((( ptr->hbs_minor[CLSTR_IFACE] == true ) ||
( ptr->hbs_minor[MGMNT_IFACE] == true )) &&
( ptr->operState == MTC_OPER_STATE__ENABLED ))
{
ptr->hbs_minor[MGMNT_IFACE] = false ;
ptr->hbs_minor[CLSTR_IFACE] = false ;
if ( force == true )
{
elog ("... %s failed ; auto-recovering\n",
ptr->hostname.c_str());
/* Set node as failed */
availStatusChange ( ptr, MTC_AVAIL_STATUS__FAILED );
enableStageChange ( ptr, MTC_ENABLE__START );
adminActionChange ( ptr, MTC_ADMIN_ACTION__NONE );
}
else
{
mnfa_recover_host ( ptr );
}
}
break ;
}
if (( ptr->next == NULL ) || ( ptr == tail ))
break ;
}
/* Stop the ... failure -> full enable ... window timer if it is active */
mtcTimer_reset ( mtcTimer_mnfa );
/* Start the timer that will eventually send the MTC_RECOVER_HBS command */
mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, MTC_MNFA_RECOVERY_TIMER );
}
mnfa_host_count[MGMNT_IFACE] = 0 ;
mnfa_host_count[CLSTR_IFACE] = 0 ;
mnfa_awol_list.clear();
}
/****************************************************************************
 *
 * Name       : mnfa_cancel
 *
 * Description: Cancel MNFA if it's active.
 *
 *              When active, the mnfa timer is reset, the heartbeat
 *              degrade bits, hbs_minor flags, graceful recovery token and
 *              task string are cleared for every pooled host that is
 *              still found in inventory, the heartbeat service is sent
 *              the MTC_RECOVER_HBS command and the mnfa host counts and
 *              active flag are cleared.
 *
 *              The awol list is emptied whether or not mnfa was active.
 *
 * NOTE(review): the result of send_hbs_command is not checked here and
 *               mnfa_backoff is left unchanged - confirm this is intended.
 *
 ****************************************************************************/
void nodeLinkClass::mnfa_cancel ( void )
{
    if ( !this->mnfa_active )
    {
        /* nothing to cancel ; just make sure the pool is empty */
        mnfa_awol_list.clear();
        return ;
    }

    wlog ("MNFA CANCEL --> Cancelling Multi-Node Failure Avoidance\n");
    mtcTimer_reset ( this->mtcTimer_mnfa );

    /* Walk the MNFA Pool and strip the MNFA attributes
     * from each host that is still provisioned. */
    std::list<string>::iterator pool_entry = mnfa_awol_list.begin() ;
    while ( pool_entry != mnfa_awol_list.end() )
    {
        struct node * host_ptr = nodeLinkClass::getNode ( *pool_entry );
        if ( host_ptr != NULL )
        {
            host_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_MGMNT ;
            host_ptr->degrade_mask &= ~DEGRADE_MASK_HEARTBEAT_CLSTR ;
            host_ptr->hbs_minor[CLSTR_IFACE] = false ;
            host_ptr->hbs_minor[MGMNT_IFACE] = false ;
            host_ptr->mnfa_graceful_recovery = false ;
            mtcInvApi_update_task ( host_ptr, "" );
        }
        pool_entry++ ;
    }

    send_hbs_command ( this->my_hostname, MTC_RECOVER_HBS );

    this->mnfa_host_count[MGMNT_IFACE] = 0 ;
    this->mnfa_host_count[CLSTR_IFACE] = 0 ;
    this->mnfa_active = false ;

    mnfa_awol_list.clear();
}
/**************************************************************************
*
* Name : mnfa_recovery_handler
*
* Purpose : Handle recovery from mnfa
*
* Description: This handler is called from the main loop to handle
* exiting MNFA and scheduling a timer to send the recover
* command to hbsAgent at base level.
*
* Assumptions: Need to send the recover command to hbsAgent at base level.
*
* If mnfa is timer driven ( mnfa_timeout != 0 ) then exit
* from mnfa happens within the mnfa timer handler which
* should not be sending messages.
*
**************************************************************************/
void nodeLinkClass::mnfa_recovery_handler ( string & hostname )
{
/* if the multi-Node-Failure Avoidance timer rang
* then run the recovery handler */
if ( this->mtcTimer_mnfa.ring == true )
{
/* rang due to mnfa_timeout */
if ( this->mnfa_active == true )
{
mtcTimer_mnfa.ring = false ;
mnfa_exit ( true );
}
/* rang due to 3 second recovery timer set in mnfa_exit */
else if ( this->mnfa_backoff == true )
{
ilog("%s heartbeat backoff recovery", hostname.c_str())
if ( send_hbs_command ( my_hostname, MTC_RECOVER_HBS ) == PASS )
{
this->mnfa_backoff = false ;
}
else
{
int retry_timeout = MTC_SECS_30 ;
/* in the case of a send failure, to avoid log flooding,
* start the timer again in 30 seconds */
mtcTimer_start ( mtcTimer_mnfa, mtcTimer_handler, retry_timeout );
ilog("%s heartbeat backoff recovery command send failed, retrying in %d secs",
hostname.c_str(), retry_timeout);
}
}
}
}