Add unhealthy state recovery audit to service management (sm)

Service Management (SM) monitors connectivity and health of
its peer controller over the OAM, Mgmt and (if provisioned)
Cluster-Host networks.

If SM sees all the links to its peer go 'carrier down' virtually
simultaneously, both controllers might declare themselves
unhealthy at the same time and both go disabled; i.e. shut down
all services with no automatic recovery.

This update adds an 'Unhealthy State Recovery Audit' to SM which
forces a self restart when all of its monitored links recover
for cases where both controllers go unhealthy-shutdown or both
controllers remain active in split-brain.

Test Plan:

PASS: Verify AIO SX install
PASS: Verify Standard system install and unhealthy state recovery
PASS: Verify single link failure end to end behavior
PASS: Verify 2 of 3 link failure end to end behavior
PASS: Verify all link failure end to end behavior
PASS: Verify SM and Mtce heartbeat recovery over unhealthy state recovery
PASS: Verify swact back and forth following a recovery
PASS: Verify process restart as part of unhealthy state recovery
PASS: Verify AIO DX install and unhealthy state recovery

Change-Id: Ie906eaf04bec607328b7e0af09b37fa0558e3bbe
Closes-Bug: 1883004
Signed-off-by: Eric MacDonald <eric.macdonald@windriver.com>
This commit is contained in:
Eric MacDonald 2020-06-11 15:32:47 -04:00
parent 3b68098be4
commit 630a777cbb
8 changed files with 310 additions and 15 deletions

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2014-2018 Wind River Systems, Inc.
// Copyright (c) 2014-2020 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
@ -581,3 +581,17 @@ SmErrorT sm_node_utils_is_aio_duplex( bool* is_aio_duplex )
return SM_OKAY;
}
// ****************************************************************************
// Node Utilities - Clear the unhealthy flag
// ==============================
extern void sm_node_utils_reset_unhealthy_flag( void )
{
    // Nothing to clear when the unhealthy flag file is not present.
    if( 0 != access( SM_NODE_UNHEALTHY_FILE, F_OK ) )
    {
        return;
    }

    unlink( SM_NODE_UNHEALTHY_FILE );

    // Probe again ; the flag file should be gone after the unlink.
    if( 0 != access( SM_NODE_UNHEALTHY_FILE, F_OK ) )
    {
        return;
    }

    DPRINTFE("file did not get removed ; %s", SM_NODE_UNHEALTHY_FILE);
}

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2014 Wind River Systems, Inc.
// Copyright (c) 2014,2020 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
@ -104,6 +104,12 @@ extern SmErrorT sm_node_utils_is_aio_duplex( bool* is_aio_duplex );
extern bool sm_node_utils_set_failover( bool to_disable );
// ****************************************************************************
// ****************************************************************************
// Node Utilities - Clear the unhealthy flag
// ==============================
extern void sm_node_utils_reset_unhealthy_flag( void );
// ****************************************************************************
#ifdef __cplusplus
}
#endif

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2014-2018 Wind River Systems, Inc.
// Copyright (c) 2014-2020 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
@ -173,6 +173,7 @@ _sm_failover_event_mappings[SM_FAILOVER_EVENT_MAX] =
{SM_FAILOVER_EVENT_HEARTBEAT_ENABLED, "heartbeat-enabled"},
{SM_FAILOVER_EVENT_IF_STATE_CHANGED, "interface-state-changed"},
{SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT, "fail-pending-timeout"},
{SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT, "failed-recovery-audit"},
{SM_FAILOVER_EVENT_NODE_ENABLED, "node-enabled"}
};
@ -186,6 +187,15 @@ _sm_failover_state_mappings[SM_FAILOVER_STATE_MAX] =
{SM_FAILOVER_STATE_SURVIVED, "survived"}
};
// Failover interface state value to string mapping table.
// Looked up by sm_failover_interface_state_str() ; the table is sized
// by the SM_FAILOVER_INTERFACE_STATE_MAX enum sentinel and carries one
// entry per SmFailoverInterfaceStateT value listed below.
static SmValueStrMappingT
_sm_failover_interface_state_mappings[SM_FAILOVER_INTERFACE_STATE_MAX] =
{
{SM_FAILOVER_INTERFACE_UNKNOWN, "unknown"},
{SM_FAILOVER_INTERFACE_OK, "ok"},
{SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT, "missing-heartbeat"},
{SM_FAILOVER_INTERFACE_DOWN, "down"}
};
static SmValueStrMappingT
_sm_service_domain_neighbor_state_mappings[SM_SERVICE_DOMAIN_NEIGHBOR_STATE_MAX] =
{
@ -993,6 +1003,17 @@ const char* sm_failover_state_str( SmFailoverStateT state )
SM_FAILOVER_STATE_MAX,
state ) );
}
// ****************************************************************************
// Types - Failover Interface State String
// =============================================
const char* sm_failover_interface_state_str( SmFailoverInterfaceStateT state )
{
    // Resolve the state through the failover interface state mapping table.
    const char* state_name =
        sm_mapping_get_str( _sm_failover_interface_state_mappings,
                            SM_FAILOVER_INTERFACE_STATE_MAX,
                            state );
    return( state_name );
}
// ****************************************************************************
// ****************************************************************************

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2014-2018 Wind River Systems, Inc.
// Copyright (c) 2014-2020 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
@ -293,6 +293,7 @@ typedef enum{
SM_FAILOVER_EVENT_HEARTBEAT_ENABLED,
SM_FAILOVER_EVENT_IF_STATE_CHANGED,
SM_FAILOVER_EVENT_FAIL_PENDING_TIMEOUT,
SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT,
SM_FAILOVER_EVENT_NODE_ENABLED,
SM_FAILOVER_EVENT_MAX
}SmFailoverEventT;
@ -730,7 +731,8 @@ typedef enum
SM_FAILOVER_INTERFACE_UNKNOWN,
SM_FAILOVER_INTERFACE_OK,
SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT,
SM_FAILOVER_INTERFACE_DOWN
SM_FAILOVER_INTERFACE_DOWN,
SM_FAILOVER_INTERFACE_STATE_MAX
}SmFailoverInterfaceStateT;
// ****************************************************************************
@ -960,6 +962,10 @@ extern const char* sm_failover_event_str( SmFailoverEventT event );
extern const char* sm_failover_state_str( SmFailoverStateT state );
// ****************************************************************************
const char* sm_failover_interface_state_str( SmFailoverInterfaceStateT state );
// ****************************************************************************
// ****************************************************************************
// Types - Service Domain Neighbor State Value
// ===========================================

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2017 Wind River Systems, Inc.
// Copyright (c) 2017-2020 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//

View File

@ -231,6 +231,10 @@ SmErrorT SmFailoverFailPendingState::event_handler(SmFailoverEventT event, const
{
blind_guess_scenario_start();
}
else
{
this->fsm.set_state(SM_FAILOVER_STATE_FAILED);
}
}
else
{

View File

@ -3,24 +3,203 @@
//
// SPDX-License-Identifier: Apache-2.0
//
#include "sm_failover_failed_state.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <errno.h>
#include <fcntl.h>
#include "sm_failover_failed_state.h"
#include "sm_types.h"
#include "sm_debug.h"
#include "sm_node_utils.h"
#include "sm_failover.h"
#include "sm_failover_fsm.h"
#include "sm_failover_ss.h"
#include "sm_failover_utils.h"
// Declared here, defined elsewhere ; consulted by the recovery criteria
// check to decide whether the cluster-host link takes part in it.
extern bool is_cluster_host_interface_configured( void );
// Failover Failed Recovery Audit period = 5 seconds
// (handed to sm_timer_register() as the timer period ; the unit is
//  presumably milliseconds - confirm against sm_timer)
static const int FAILED_STATE_AUDIT_PERIOD = 5000;
// Recovery log throttle threshold - 1 log every minute
// (compared against the per-state _log_throttle counter in the event handler)
static const int SM_FAILOVER_FAILED_LOG_THROTTLE_THLD = 12;
// processes to restart over a failover failed recovery
// MAX_RESTART_PROCESS_NAME_LEN sizes the buffer the process name is
// copied into for the pmon-restart argv, terminating NUL included.
#define MAX_RESTART_PROCESS_NAME_LEN 10
#define PROCESS_HBSAGENT ((const char *)("hbsAgent"))
#define PROCESS_SM ((const char *)("sm"))
// Failover Failed state class constructor
// - starts with no recovery audit timer registered
// - zeroes the recovery log throttle counter ; the event handler
//   pre-increments it, so it must never hold an indeterminate value
SmFailoverFailedState::SmFailoverFailedState(SmFailoverFSM& fsm)
    : SmFSMState(fsm),
      _failed_state_audit_timer_id(SM_TIMER_ID_INVALID),
      _log_throttle(0)
{
}
// The 'Failover Failed' state destructor
// - cancels the recovery audit timer when one is still registered ;
//   the outcome is not acted upon here
SmFailoverFailedState::~SmFailoverFailedState()
{
    (void)_deregister_timer();
}
// Failover Failed state entry class member function
// - runs the base class entry handling
// - announces the state change in the error log
// - starts the Failover Failed state recovery audit timer and
//   returns the result of that registration
SmErrorT SmFailoverFailedState::enter_state()
{
    SmFSMState::enter_state();

    DPRINTFE("********************************************************");
    DPRINTFE("Entering Failover Failed state ; recovery audit started ");
    DPRINTFE("********************************************************");

    const SmErrorT register_result = this->_register_timer();
    if(SM_OKAY == register_result)
    {
        return register_result;
    }

    DPRINTFE("Failed to register failed state timer. Error %s", sm_error_str(register_result));
    return register_result;
}
// Failover Failed state audit timer handler
// - posts a 'failed recovery audit' event to the failover FSM
// - neither argument is used ; always returns true
//   NOTE(review): true presumably keeps the timer armed - confirm
//   against the sm_timer callback contract
bool SmFailoverFailedState::_failed_state_audit(
    SmTimerIdT timer_id, int64_t user_data)
{
    (void)timer_id;
    (void)user_data;

    SmFailoverFSM::get_fsm().send_event(
        SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT, NULL);

    return true;
}
// Issue a self restart through pmon-restart service
static bool sm_failover_failed_process_restart( const char * process )
{
DPRINTFI( "Issuing controlled process restart ; pmon-restart %s", process);
pid_t pid = fork();
if( 0 > pid )
{
DPRINTFE( "Failed to fork 'pmond-restart %s' request, error=%s.",
process, strerror( errno ) );
return( true );
}
else if( 0 == pid )
{
// set the arguement array for execv
char pmon_restart_cmd[] = "/usr/local/sbin/pmon-restart";
char pmon_restart_process[MAX_RESTART_PROCESS_NAME_LEN] ;
snprintf(&pmon_restart_process[0], MAX_RESTART_PROCESS_NAME_LEN, "%s", process);
char* pmon_restart_argv[3] ;
pmon_restart_argv[0] = pmon_restart_cmd;
pmon_restart_argv[1] = pmon_restart_process;
pmon_restart_argv[2] = NULL;
// Add the path to socat for pmon-restart
char path[] = "PATH=/usr/bin:$PATH";
char* pmon_restart_env[2] ;
pmon_restart_env[0] = path;
pmon_restart_env[1] = NULL;
setpgid( 0, 0 );
struct rlimit file_limits;
if( 0 == getrlimit( RLIMIT_NOFILE, &file_limits ) )
{
unsigned int fd_i;
for( fd_i=0; fd_i < file_limits.rlim_cur; ++fd_i )
{
close( fd_i );
}
open( "/dev/null", O_RDONLY ); // stdin
open( "/dev/null", O_WRONLY ); // stdout
open( "/dev/null", O_WRONLY ); // stderr
}
execve( pmon_restart_argv[0], pmon_restart_argv, pmon_restart_env );
// Shouldn't get this far, else there was an error.
exit(-1);
}
return( false );
}
// Failover Failed recovery criteria checker
static bool sm_failover_failed_recovery_criteria_met( void )
{
bool criteria_met = false ;
SmFailoverInterfaceStateT oam_state, mgmt_state, cluster_host_state;
oam_state = sm_failover_get_interface_info(SM_INTERFACE_OAM);
mgmt_state = sm_failover_get_interface_info(SM_INTERFACE_MGMT);
if ( is_cluster_host_interface_configured() )
{
cluster_host_state = sm_failover_get_interface_info(SM_INTERFACE_CLUSTER_HOST);
if ((( oam_state == SM_FAILOVER_INTERFACE_OK ) || ( oam_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )) &&
(( mgmt_state == SM_FAILOVER_INTERFACE_OK ) || ( mgmt_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )) &&
(( cluster_host_state == SM_FAILOVER_INTERFACE_OK ) || ( cluster_host_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )))
{
criteria_met = true ;
}
}
else if ((( oam_state == SM_FAILOVER_INTERFACE_OK ) || ( oam_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )) &&
(( mgmt_state == SM_FAILOVER_INTERFACE_OK ) || ( mgmt_state == SM_FAILOVER_INTERFACE_MISSING_HEARTBEAT )))
{
criteria_met = true ;
}
DPRINTFI("Oam:%s ; Mgmt:%s ; Cluster:%s ; recovery criteria met: %s",
sm_failover_interface_state_str(oam_state),
sm_failover_interface_state_str(mgmt_state),
sm_failover_interface_state_str(cluster_host_state),
criteria_met ? "Yes" : "No");
return (criteria_met);
}
// The 'Failover Failed' state recovery audit handler
SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data)
{
// Recovery from the failed state is either a reboot triggered by mtce
// or, once the recovery audit below finds the monitored interfaces
// recovered, a controlled restart of hbsAgent and sm.
event_data=event_data;
switch (event)
{
case SM_FAILOVER_EVENT_IF_STATE_CHANGED:
// event will be fired, but couldn't bring fsm state back to normal
case SM_FAILOVER_EVENT_FAILED_RECOVERY_AUDIT:
{
if ( sm_failover_failed_recovery_criteria_met() )
{
DPRINTFI("************************************");
DPRINTFI("** Failover Failed state recovery **");
DPRINTFI("************************************");
sm_node_utils_reset_unhealthy_flag();
sm_failover_failed_process_restart(PROCESS_HBSAGENT);
sm_failover_failed_process_restart(PROCESS_SM);
for ( int i = 0 ; i < 10 ; i++ )
{
// waiting for shutdown
sleep(1);
}
DPRINTFE("Restart did not occur ; reinstating unhealthy flag ; recovery will retry");
sm_node_utils_set_unhealthy();
}
else if ( ++_log_throttle > 1 )
{
if ( _log_throttle > SM_FAILOVER_FAILED_LOG_THROTTLE_THLD )
_log_throttle = 0 ;
}
else
{
DPRINTFI("Failover Failed state recovery monitor");
}
break;
}
default:
DPRINTFE("Runtime error, unexpected event %s, at state %s",
sm_failover_event_str(event),
@ -28,3 +207,58 @@ SmErrorT SmFailoverFailedState::event_handler(SmFailoverEventT event, const ISmF
}
return SM_OKAY;
}
// Start the 'Failover Failed' state recovery audit
// - cancels an audit timer that is still registered, then registers a
//   fresh one with the FAILED_STATE_AUDIT_PERIOD period
// - returns the result of the timer registration
SmErrorT SmFailoverFailedState::_register_timer()
{
    if(SM_TIMER_ID_INVALID != this->_failed_state_audit_timer_id)
    {
        this->_deregister_timer();
    }

    const SmErrorT register_result =
        sm_timer_register("FAILED STATE AUDIT TIMER",
                          FAILED_STATE_AUDIT_PERIOD,
                          SmFailoverFailedState::_failed_state_audit,
                          0,
                          &this->_failed_state_audit_timer_id);
    return register_result;
}
// Stop the 'Failover Failed' state recovery audit
// - returns SM_OKAY when no audit timer is registered
// - otherwise returns the result of the timer deregistration ; the
//   stored timer id is only cleared when that deregistration succeeds
SmErrorT SmFailoverFailedState::_deregister_timer()
{
    if(SM_TIMER_ID_INVALID == this->_failed_state_audit_timer_id)
    {
        return SM_OKAY;
    }

    const SmErrorT cancel_result =
        sm_timer_deregister(this->_failed_state_audit_timer_id);
    if( SM_OKAY == cancel_result )
    {
        this->_failed_state_audit_timer_id = SM_TIMER_ID_INVALID;
        return cancel_result;
    }

    DPRINTFE( "Failed to cancel failed timer, error=%s.",
              sm_error_str( cancel_result ) );
    return cancel_result;
}
// Failover Failed state exit class member function
// - stops the recovery audit timer, retrying once directly if the
//   first attempt left the timer id set
// - runs the base class exit handling
// - returns the result of the last deregistration attempt
SmErrorT SmFailoverFailedState::exit_state()
{
    SmErrorT status = this->_deregister_timer();
    if(SM_OKAY != status)
    {
        DPRINTFE("Failed to deregister fail failed timer. Error %s", sm_error_str(status));
    }

    // The id is still set only when the attempt above did not clear it ;
    // try once more and drop the id whatever the outcome.
    const bool timer_still_registered =
        (SM_TIMER_ID_INVALID != _failed_state_audit_timer_id);
    if(timer_still_registered)
    {
        status = sm_timer_deregister(_failed_state_audit_timer_id);
        _failed_state_audit_timer_id = SM_TIMER_ID_INVALID;
        if(SM_OKAY != status)
        {
            DPRINTFE("Failed to deregister action timer. Error %s", sm_error_str(status));
        }
    }

    SmFSMState::exit_state();
    return status;
}

View File

@ -1,5 +1,5 @@
//
// Copyright (c) 2018 Wind River Systems, Inc.
// Copyright (c) 2020 Wind River Systems, Inc.
//
// SPDX-License-Identifier: Apache-2.0
//
@ -12,12 +12,22 @@
// Failover FSM 'failed' state.
// Declares the state entry/exit hooks, the event handler and the
// helpers and data for the periodic recovery audit timer.
class SmFailoverFailedState : public SmFSMState
{
public:
// NOTE(review): both an inline constructor definition and an out-of-line
// constructor declaration appear in this view ; presumably one of them is
// the pre-change line of the diff - confirm which one the header keeps.
SmFailoverFailedState(SmFailoverFSM& fsm) : SmFSMState(fsm){}
SmFailoverFailedState(SmFailoverFSM& fsm);
virtual ~SmFailoverFailedState();
// State entry / exit hooks.
SmErrorT enter_state();
SmErrorT exit_state();
protected:
// Handles failover events delivered while in this state.
SmErrorT event_handler(SmFailoverEventT event, const ISmFSMEventData* event_data);
private:
// Id of the recovery audit timer ; SM_TIMER_ID_INVALID when none is registered.
SmTimerIdT _failed_state_audit_timer_id;
// Timer callback ; static so it can be handed to sm_timer_register().
static bool _failed_state_audit(SmTimerIdT timer_id, int64_t user_data);
// Start / stop the recovery audit timer.
SmErrorT _register_timer();
SmErrorT _deregister_timer();
// Counter the event handler uses to throttle its recovery monitor log.
int _log_throttle ;
};
#endif //__SM_FAILOVER_FAILED_STATE_H__
#endif //__SM_FAILOVER_FAILED_STATE_H__