Add unhealthy state recovery audit to service management (sm)
Service Management (SM) monitors connectivity and health of its peer controller over the OAM, Mgmt and (if provisioned) Cluster-Host networks.

If SM sees all the links to its peer go 'carrier down' virtually simultaneously, it is possible that both controllers might simultaneously declare themselves unhealthy and both go disabled; i.e. shut down all services with no automatic recovery.

This update adds an 'Unhealthy State Recovery Audit' to SM which forces a self restart when all of its monitored links recover, for cases where both controllers go unhealthy-shutdown or both controllers remain active in split-brain.

Test Plan:
PASS: Verify AIO SX install
PASS: Verify Standard system install and unhealthy state recovery
PASS: Verify single link failure end to end behavior
PASS: Verify 2 of 3 link failure end to end behavior
PASS: Verify all link failure end to end behavior
PASS: Verify SM and Mtce heartbeat recovery over unhealthy state recovery
PASS: Verify swact back and forth following a recovery
PASS: Verify process restart as part of unhealthy state recovery
PASS: Verify AIO DX install and unhealthy state recovery

Change-Id: Ie906eaf04bec607328b7e0af09b37fa0558e3bbe
Closes-Bug: 1883004
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
parent
3b68098be4
commit
630a777cbb
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (c) 2014-2018 Wind River Systems, Inc.
|
||||
// Copyright (c) 2014-2020 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
@ -581,3 +581,17 @@ SmErrorT sm_node_utils_is_aio_duplex( bool* is_aio_duplex )
|
|||
return SM_OKAY;
|
||||
}
|
||||
|
||||
// ****************************************************************************
|
||||
// Node Utilities - Clear the unhealthy flag
|
||||
// ==============================
|
||||
extern void sm_node_utils_reset_unhealthy_flag( void )
|
||||
{
|
||||
if( 0 == access( SM_NODE_UNHEALTHY_FILE, F_OK ) )
|
||||
{
|
||||
unlink( SM_NODE_UNHEALTHY_FILE );
|
||||
if( 0 == access( SM_NODE_UNHEALTHY_FILE, F_OK ) )
|
||||
{
|
||||
DPRINTFE("file did not get removed ; %s", SM_NODE_UNHEALTHY_FILE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (c) 2014 Wind River Systems, Inc.
|
||||
// Copyright (c) 2014,2020 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
@ -104,6 +104,12 @@ extern SmErrorT sm_node_utils_is_aio_duplex( bool* is_aio_duplex );
|
|||
extern bool sm_node_utils_set_failover( bool to_disable );
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Node Utilities - Clear the unhealthy flag
|
||||
// ==============================
|
||||
extern void sm_node_utils_reset_unhealthy_flag( void );
|
||||
// ****************************************************************************
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (c) 2014-2018 Wind River Systems, Inc.
|
||||
// Copyright (c) 2014-2020 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
@ -173,6 +173,7 @@ _sm_failover_event_mappings[SM_FAILOVER_EVENT_MAX] =
|
|||
{SM_FAILOVER_EVENT_HEARTBEAT_ENABLED, "heartbeat-enabled"},
|
||||
{SM_FAILOVER_EVENT_IF_STATE_CHANGED, "interface-state-changed"},
|
||||
{SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT, "fail-pending-timeout"},
|
||||
{SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT, "failed-recovery-audit"},
|
||||
{SM_FAILOVER_EVENT_NODE_ENABLED, "node-enabled"}
|
||||
};
|
||||
|
||||
|
@ -186,6 +187,15 @@ _sm_failover_state_mappings[SM_FAILOVER_STATE_MAX] =
|
|||
{SM_FAILOVER_STATE_SURVIVED, "survived"}
|
||||
};
|
||||
|
||||
// Lookup table pairing each failover interface state value with the
// string returned for it by sm_failover_interface_state_str().
static SmValueStrMappingT
_sm_failover_interface_state_mappings[SM_FAILOVER_INTERFACE_STATE_MAX] =
{
    { SM_FAILOVER_INTERFACE_UNKNOWN,           "unknown"           },
    { SM_FAILOVER_INTERFACE_OK,                "ok"                },
    { SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT, "missing-heartbeat" },
    { SM_FAILOVER_INTERFACE_DOWN,              "down"              }
};
|
||||
|
||||
static SmValueStrMappingT
|
||||
_sm_service_domain_neighbor_state_mappings[SM_SERVICE_DOMAIN_NEIGHBOR_STATE_MAX] =
|
||||
{
|
||||
|
@ -993,6 +1003,17 @@ const char* sm_failover_state_str( SmFailoverStateT state )
|
|||
SM_FAILOVER_STATE_MAX,
|
||||
state ) );
|
||||
}
|
||||
|
||||
// ****************************************************************************
|
||||
// Types - Failover Interface State String
|
||||
// =============================================
|
||||
const char* sm_failover_interface_state_str( SmFailoverInterfaceStateT state )
{
    // Resolve the state value through the interface state mapping table.
    const char* state_str =
        sm_mapping_get_str( _sm_failover_interface_state_mappings,
                            SM_FAILOVER_INTERFACE_STATE_MAX,
                            state );
    return state_str;
}
|
||||
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (c) 2014-2018 Wind River Systems, Inc.
|
||||
// Copyright (c) 2014-2020 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
@ -293,6 +293,7 @@ typedef enum{
|
|||
SM_FAILOVER_EVENT_HEARTBEAT_ENABLED,
|
||||
SM_FAILOVER_EVENT_IF_STATE_CHANGED,
|
||||
SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT,
|
||||
SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT,
|
||||
SM_FAILOVER_EVENT_NODE_ENABLED,
|
||||
SM_FAILOVER_EVENT_MAX
|
||||
}SmFailoverEventT;
|
||||
|
@ -730,7 +731,8 @@ typedef enum
|
|||
SM_FAILOVER_INTERFACE_UNKNOWN,
|
||||
SM_FAILOVER_INTERFACE_OK,
|
||||
SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT,
|
||||
SM_FAILOVER_INTERFACE_DOWN
|
||||
SM_FAILOVER_INTERFACE_DOWN,
|
||||
SM_FAILOVER_INTERFACE_STATE_MAX
|
||||
}SmFailoverInterfaceStateT;
|
||||
|
||||
// ****************************************************************************
|
||||
|
@ -960,6 +962,10 @@ extern const char* sm_failover_event_str( SmFailoverEventT event );
|
|||
extern const char* sm_failover_state_str( SmFailoverStateT state );
|
||||
// ****************************************************************************
|
||||
|
||||
const char* sm_failover_interface_state_str( SmFailoverInterfaceStateT state );
|
||||
|
||||
// ****************************************************************************
|
||||
|
||||
// ****************************************************************************
|
||||
// Types - Service Domain Neighbor State Value
|
||||
// ===========================================
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (c) 2017 Wind River Systems, Inc.
|
||||
// Copyright (c) 2017-2020 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
|
|
@ -231,6 +231,10 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const
|
|||
{
|
||||
blind_guess_scenario_start();
|
||||
}
|
||||
else
|
||||
{
|
||||
this->fsm.set_state(SM_FAILOVER_STATE_FAILED);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
@ -3,24 +3,203 @@
|
|||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#include "sm_failover_failed_state.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <limits.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
#include <sys/stat.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#include "sm_failover_failed_state.h"
|
||||
#include "sm_types.h"
|
||||
#include "sm_debug.h"
|
||||
#include "sm_node_utils.h"
|
||||
#include "sm_failover.h"
|
||||
#include "sm_failover_fsm.h"
|
||||
#include "sm_failover_ss.h"
|
||||
#include "sm_failover_utils.h"
|
||||
|
||||
extern bool is_cluster_host_interface_configured( void );
|
||||
|
||||
// Failover Failed Recovery Audit period = 5 seconds
|
||||
static const int FAILED_STATE_AUDIT_PERIOD = 5000;
|
||||
|
||||
// Recovery log throttle threshold - 1 log every minute
|
||||
static const int SM_FAILOVER_FAILED_LOG_THROTTLE_THLD = 12;
|
||||
|
||||
// processes to restart over a failover failed recovery
|
||||
#define MAX_RESTART_PROCESS_NAME_LEN 10
|
||||
#define PROCESS_HBSAGENT ((const char *)("hbsAgent"))
|
||||
#define PROCESS_SM ((const char *)("sm"))
|
||||
|
||||
// Failover Failed state class constructor
|
||||
SmFailoverFailedState::SmFailoverFailedState(SmFailoverFSM& fsm) : SmFSMState(fsm)
|
||||
{
|
||||
this->_failed_state_audit_timer_id = SM_TIMER_ID_INVALID;
|
||||
}
|
||||
|
||||
// The 'Failover Failed' state destructor
|
||||
// - stops the recovery audit if needed
|
||||
SmFailoverFailedState::~SmFailoverFailedState()
|
||||
{
|
||||
this->_deregister_timer();
|
||||
}
|
||||
|
||||
// Failover Failed state entry class member function
|
||||
// - starts the Failover Failed state recovery audit timer
|
||||
SmErrorT SmFailoverFailedState::enter_state()
|
||||
{
|
||||
SmFSMState::enter_state();
|
||||
|
||||
DPRINTFE("********************************************************");
|
||||
DPRINTFE("Entering Failover Failed state ; recovery audit started ");
|
||||
DPRINTFE("********************************************************");
|
||||
|
||||
SmErrorT error = this->_register_timer();
|
||||
if(SM_OKAY != error)
|
||||
{
|
||||
DPRINTFE("Failed to register failed state timer. Error %s", sm_error_str(error));
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
// Failover Failed state audit timer handler
|
||||
bool SmFailoverFailedState::_failed_state_audit(
|
||||
SmTimerIdT timer_id, int64_t user_data)
|
||||
{
|
||||
SmFailoverFSM::get_fsm().send_event(SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT, NULL);
|
||||
return true ;
|
||||
}
|
||||
|
||||
// Issue a self restart through pmon-restart service
|
||||
static bool sm_failover_failed_process_restart( const char * process )
|
||||
{
|
||||
DPRINTFI( "Issuing controlled process restart ; pmon-restart %s", process);
|
||||
pid_t pid = fork();
|
||||
if( 0 > pid )
|
||||
{
|
||||
DPRINTFE( "Failed to fork 'pmond-restart %s' request, error=%s.",
|
||||
process, strerror( errno ) );
|
||||
return( true );
|
||||
}
|
||||
else if( 0 == pid )
|
||||
{
|
||||
// set the arguement array for execv
|
||||
char pmon_restart_cmd[] = "/usr/local/sbin/pmon-restart";
|
||||
|
||||
char pmon_restart_process[MAX_RESTART_PROCESS_NAME_LEN] ;
|
||||
snprintf(&pmon_restart_process[0], MAX_RESTART_PROCESS_NAME_LEN, "%s", process);
|
||||
|
||||
char* pmon_restart_argv[3] ;
|
||||
pmon_restart_argv[0] = pmon_restart_cmd;
|
||||
pmon_restart_argv[1] = pmon_restart_process;
|
||||
pmon_restart_argv[2] = NULL;
|
||||
|
||||
// Add the path to socat for pmon-restart
|
||||
char path[] = "PATH=/usr/bin:$PATH";
|
||||
char* pmon_restart_env[2] ;
|
||||
pmon_restart_env[0] = path;
|
||||
pmon_restart_env[1] = NULL;
|
||||
|
||||
setpgid( 0, 0 );
|
||||
|
||||
struct rlimit file_limits;
|
||||
if( 0 == getrlimit( RLIMIT_NOFILE, &file_limits ) )
|
||||
{
|
||||
unsigned int fd_i;
|
||||
for( fd_i=0; fd_i < file_limits.rlim_cur; ++fd_i )
|
||||
{
|
||||
close( fd_i );
|
||||
}
|
||||
open( "/dev/null", O_RDONLY ); // stdin
|
||||
open( "/dev/null", O_WRONLY ); // stdout
|
||||
open( "/dev/null", O_WRONLY ); // stderr
|
||||
}
|
||||
|
||||
execve( pmon_restart_argv[0], pmon_restart_argv, pmon_restart_env );
|
||||
|
||||
// Shouldn't get this far, else there was an error.
|
||||
exit(-1);
|
||||
}
|
||||
return( false );
|
||||
}
|
||||
|
||||
// Failover Failed recovery criteria checker
|
||||
static bool sm_failover_failed_recovery_criteria_met( void )
|
||||
{
|
||||
bool criteria_met = false ;
|
||||
|
||||
SmFailoverInterfaceStateT oam_state, mgmt_state, cluster_host_state;
|
||||
oam_state = sm_failover_get_interface_info(SM_INTERFACE_OAM);
|
||||
mgmt_state = sm_failover_get_interface_info(SM_INTERFACE_MGMT);
|
||||
|
||||
if ( is_cluster_host_interface_configured() )
|
||||
{
|
||||
cluster_host_state = sm_failover_get_interface_info(SM_INTERFACE_CLUSTER_HOST);
|
||||
if ((( oam_state == SM_FAILOVER_INTERFACE_OK ) || ( oam_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )) &&
|
||||
(( mgmt_state == SM_FAILOVER_INTERFACE_OK ) || ( mgmt_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )) &&
|
||||
(( cluster_host_state == SM_FAILOVER_INTERFACE_OK ) || ( cluster_host_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )))
|
||||
{
|
||||
criteria_met = true ;
|
||||
}
|
||||
}
|
||||
else if ((( oam_state == SM_FAILOVER_INTERFACE_OK ) || ( oam_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )) &&
|
||||
(( mgmt_state == SM_FAILOVER_INTERFACE_OK ) || ( mgmt_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )))
|
||||
{
|
||||
criteria_met = true ;
|
||||
}
|
||||
|
||||
DPRINTFI("Oam:%s ; Mgmt:%s ; Cluster:%s ; recovery criteria met: %s",
|
||||
sm_failover_interface_state_str(oam_state),
|
||||
sm_failover_interface_state_str(mgmt_state),
|
||||
sm_failover_interface_state_str(cluster_host_state),
|
||||
criteria_met ? "Yes" : "No");
|
||||
|
||||
return (criteria_met);
|
||||
}
|
||||
|
||||
// The 'Failover Failed' state recovery audit handler
|
||||
SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
|
||||
{
|
||||
// Recovery from the failed state is either a reboot triggered by mtce or,
// when the recovery criteria checked below is met, a controlled restart
// of hbsAgent and sm requested through pmon-restart.
|
||||
event_data=event_data;
|
||||
switch (event)
|
||||
{
|
||||
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
|
||||
// an interface state change is handled like a recovery audit ; fall through
|
||||
case SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT:
|
||||
{
|
||||
if ( sm_failover_failed_recovery_criteria_met() )
|
||||
{
|
||||
DPRINTFI("************************************");
|
||||
DPRINTFI("** Failover Failed state recovery **");
|
||||
DPRINTFI("************************************");
|
||||
sm_node_utils_reset_unhealthy_flag();
|
||||
sm_failover_failed_process_restart(PROCESS_HBSAGENT);
|
||||
sm_failover_failed_process_restart(PROCESS_SM);
|
||||
for ( int i = 0 ; i < 10 ; i++ )
|
||||
{
|
||||
// waiting for shutdown
|
||||
sleep(1);
|
||||
}
|
||||
DPRINTFE("Restart did not occur ; reinstating unhealthy flag ; recovery will retry");
|
||||
sm_node_utils_set_unhealthy();
|
||||
}
|
||||
else if ( ++_log_throttle > 1 )
|
||||
{
|
||||
if ( _log_throttle > SM_FAILOVER_FAILED_LOG_THROTTLE_THLD )
|
||||
_log_throttle = 0 ;
|
||||
}
|
||||
else
|
||||
{
|
||||
DPRINTFI("Failover Failed state recovery monitor");
|
||||
}
|
||||
break;
|
||||
|
||||
}
|
||||
default:
|
||||
DPRINTFE("Runtime error, unexpected event %s, at state %s",
|
||||
sm_failover_event_str(event),
|
||||
|
@ -28,3 +207,58 @@ SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmF
|
|||
}
|
||||
return SM_OKAY;
|
||||
}
|
||||
|
||||
// Start the 'Failover Failed' state recovery audit
|
||||
SmErrorT SmFailoverFailedState::_register_timer()
|
||||
{
|
||||
SmErrorT error;
|
||||
const char* timer_name = "FAILED STATE AUDIT TIMER";
|
||||
if(SM_TIMER_ID_INVALID != this->_failed_state_audit_timer_id)
|
||||
this->_deregister_timer();
|
||||
|
||||
error = sm_timer_register(timer_name, FAILED_STATE_AUDIT_PERIOD,
|
||||
SmFailoverFailedState::_failed_state_audit,
|
||||
0, &this->_failed_state_audit_timer_id);
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
// Stop the 'Failover Failed' state recovery audit
|
||||
SmErrorT SmFailoverFailedState::_deregister_timer()
|
||||
{
|
||||
SmErrorT error = SM_OKAY;
|
||||
if(SM_TIMER_ID_INVALID != this->_failed_state_audit_timer_id)
|
||||
{
|
||||
error = sm_timer_deregister(this->_failed_state_audit_timer_id);
|
||||
if( SM_OKAY != error )
|
||||
{
|
||||
DPRINTFE( "Failed to cancel failed timer, error=%s.",
|
||||
sm_error_str( error ) );
|
||||
}else
|
||||
{
|
||||
this->_failed_state_audit_timer_id = SM_TIMER_ID_INVALID;
|
||||
}
|
||||
}
|
||||
return error;
|
||||
}
|
||||
|
||||
|
||||
// Exit action for the Failover Failed state
//  - deregisters the recovery audit timer
//  - if the timer id is still valid afterwards, tries the deregistration
//    once more and invalidates the id regardless of the outcome
//  - runs the base class exit handling
// Returns the result of the last deregistration attempt.
SmErrorT SmFailoverFailedState::exit_state()
{
    SmErrorT rc = this->_deregister_timer();
    if( rc != SM_OKAY )
    {
        DPRINTFE("Failed to deregister fail failed timer. Error %s", sm_error_str(rc));
    }

    const bool timer_still_registered =
        ( _failed_state_audit_timer_id != SM_TIMER_ID_INVALID );
    if( timer_still_registered )
    {
        rc = sm_timer_deregister(_failed_state_audit_timer_id);
        _failed_state_audit_timer_id = SM_TIMER_ID_INVALID;
        if( rc != SM_OKAY )
        {
            DPRINTFE("Failed to deregister action timer. Error %s", sm_error_str(rc));
        }
    }

    SmFSMState::exit_state();
    return rc;
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
//
|
||||
// Copyright (c) 2018 Wind River Systems, Inc.
|
||||
// Copyright (c) 2020 Wind River Systems, Inc.
|
||||
//
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
@ -12,12 +12,22 @@
|
|||
class SmFailoverFailedState : public SmFSMState
|
||||
{
|
||||
public:
|
||||
SmFailoverFailedState(SmFailoverFSM& fsm) : SmFSMState(fsm){}
|
||||
SmFailoverFailedState(SmFailoverFSM& fsm);
|
||||
virtual ~SmFailoverFailedState();
|
||||
SmErrorT enter_state();
|
||||
SmErrorT exit_state();
|
||||
|
||||
protected:
|
||||
SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data);
|
||||
|
||||
private:
|
||||
SmTimerIdT _failed_state_audit_timer_id;
|
||||
static bool _failed_state_audit(SmTimerIdT timer_id, int64_t user_data);
|
||||
SmErrorT _register_timer();
|
||||
SmErrorT _deregister_timer();
|
||||
|
||||
int _log_throttle ;
|
||||
};
|
||||
|
||||
|
||||
#endif //__SM_FAILOVER_FAILED_STATE_H__
|
||||
#endif //__SM_FAILOVER_FAILED_STATE_H__
|
||||
|
|
Loading…
Reference in New Issue